diff options
Diffstat (limited to 'fs')
224 files changed, 4291 insertions, 2843 deletions
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index e78956cbd70..34c59f14a1c 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -144,7 +144,7 @@ extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nameidata); + unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index d529437ff44..64600b5d052 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -100,13 +100,13 @@ static void v9fs_dentry_release(struct dentry *dentry) } } -static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct p9_fid *fid; struct inode *inode; struct v9fs_inode *v9inode; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 57ccb7537da..cbf9dbb1b2a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -712,88 +712,34 @@ error: } /** - * v9fs_vfs_create - VFS hook to create files + * v9fs_vfs_create - VFS hook to create a regular file + * + * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called + * for mknod(2). + * * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @mode: create permissions - * @nd: path information * */ static int v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { - int err; - u32 perm; - int flags; - struct file *filp; - struct v9fs_inode *v9inode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid, *inode_fid; - - err = 0; - fid = NULL; - v9ses = v9fs_inode2v9ses(dir); - perm = unixmode2p9mode(v9ses, mode); - if (nd) - flags = nd->intent.open.flags; - else - flags = O_RDWR; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); + u32 perm = unixmode2p9mode(v9ses, mode); + struct p9_fid *fid; - fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags, - v9fs_proto_dotu(v9ses))); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - fid = NULL; - goto error; - } + /* P9_OEXCL? */ + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, P9_ORDWR); + if (IS_ERR(fid)) + return PTR_ERR(fid); v9fs_invalidate_inode_attr(dir); - /* if we are opening a file, assign the open fid to the file */ - if (nd) { - v9inode = V9FS_I(dentry->d_inode); - mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && - ((flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - inode_fid = v9fs_writeback_fid(dentry); - if (IS_ERR(inode_fid)) { - err = PTR_ERR(inode_fid); - mutex_unlock(&v9inode->v_mutex); - goto error; - } - v9inode->writeback_fid = (void *) inode_fid; - } - mutex_unlock(&v9inode->v_mutex); - filp = lookup_instantiate_filp(nd, dentry, generic_file_open); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); - goto error; - } - - filp->private_data = fid; -#ifdef CONFIG_9P_FSCACHE - if (v9ses->cache) - v9fs_cache_inode_set_cookie(dentry->d_inode, filp); -#endif - } else - p9_client_clunk(fid); + p9_client_clunk(fid); return 0; - -error: - if (fid) - p9_client_clunk(fid); - - return err; } /** @@ -839,7 +785,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode */ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nameidata) + unsigned int flags) { struct dentry *res; struct super_block *sb; @@ -849,8 +795,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, char *name; int result = 0; - p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_name.name, dentry, nameidata); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n", + dir, dentry->d_name.name, dentry, flags); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -910,6 +856,86 @@ error: return ERR_PTR(result); } +static int +v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) +{ + int err; + u32 perm; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *inode_fid; + struct dentry *res = NULL; + + if (d_unhashed(dentry)) { + res = v9fs_vfs_lookup(dir, dentry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + dentry = res; + } + + /* Only creates */ + if (!(flags & O_CREAT) || dentry->d_inode) + return finish_no_open(file, res); + + err = 0; + fid = NULL; + v9ses = v9fs_inode2v9ses(dir); + perm = unixmode2p9mode(v9ses, mode); + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + v9fs_invalidate_inode_attr(dir); + v9inode = V9FS_I(dentry->d_inode); + mutex_lock(&v9inode->v_mutex); + if (v9ses->cache && !v9inode->writeback_fid && + ((flags & O_ACCMODE) != O_RDONLY)) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + mutex_unlock(&v9inode->v_mutex); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } + mutex_unlock(&v9inode->v_mutex); + err = finish_open(file, dentry, generic_file_open, opened); + if (err) + goto error; + + file->private_data = fid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(dentry->d_inode, file); +#endif + + *opened |= FILE_CREATED; +out: + dput(res); + return err; + +error: + if (fid) + p9_client_clunk(fid); + goto out; +} + /** * v9fs_vfs_unlink - VFS unlink hook to delete an inode * @i: inode that is being unlinked @@ -1488,6 +1514,7 @@ out: static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, + .atomic_open = v9fs_vfs_atomic_open, .symlink = v9fs_vfs_symlink, .link = v9fs_vfs_link, .unlink = v9fs_vfs_unlink, @@ -1502,6 +1529,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { static const struct inode_operations v9fs_dir_inode_operations = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, + .atomic_open = v9fs_vfs_atomic_open, .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir, .rmdir = v9fs_vfs_rmdir, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e3dd2a1e2bf..40895546e10 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -230,20 +230,25 @@ int v9fs_open_to_dotl_flags(int flags) * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @mode: create permissions - * @nd: path information * */ static int v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - struct nameidata *nd) + bool excl) +{ + return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); +} + +static int +v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t omode, + int *opened) { int err = 0; gid_t gid; - int flags; umode_t mode; char *name = NULL; - struct file *filp; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; @@ -251,19 +256,23 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct p9_fid *dfid, *ofid, *inode_fid; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; + struct dentry *res = NULL; - v9ses = v9fs_inode2v9ses(dir); - if (nd) - flags = nd->intent.open.flags; - else { - /* - * create call without LOOKUP_OPEN is due - * to mknod of regular files. So use mknod - * operation. - */ - return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + if (d_unhashed(dentry)) { + res = v9fs_vfs_lookup(dir, dentry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + dentry = res; } + /* Only creates */ + if (!(flags & O_CREAT) || dentry->d_inode) + return finish_no_open(file, res); + + v9ses = v9fs_inode2v9ses(dir); + name = (char *) dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", name, flags, omode); @@ -272,7 +281,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); - return err; + goto out; } /* clone a fid to use for creation */ @@ -280,7 +289,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - return err; + goto out; } gid = v9fs_get_fsgid_for_create(dir); @@ -345,17 +354,18 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, } mutex_unlock(&v9inode->v_mutex); /* Since we are opening a file, assign the open fid to the file */ - filp = lookup_instantiate_filp(nd, dentry, generic_file_open); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); + err = finish_open(file, dentry, generic_file_open, opened); + if (err) goto err_clunk_old_fid; - } - filp->private_data = ofid; + file->private_data = ofid; #ifdef CONFIG_9P_FSCACHE if (v9ses->cache) - v9fs_cache_inode_set_cookie(inode, filp); + v9fs_cache_inode_set_cookie(inode, file); #endif - return 0; + *opened |= FILE_CREATED; +out: + dput(res); + return err; error: if (fid) @@ -364,7 +374,7 @@ err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); v9fs_set_create_acl(NULL, &dacl, &pacl); - return err; + goto out; } /** @@ -982,6 +992,7 @@ out: const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create_dotl, + .atomic_open = v9fs_vfs_atomic_open_dotl, .lookup = v9fs_vfs_lookup, .link = v9fs_vfs_link_dotl, .symlink = v9fs_vfs_symlink_dotl, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8c92a9ba833..137d5039689 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -89,7 +89,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; - sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; if (!v9ses->cache) sb->s_flags |= MS_SYNCHRONOUS; @@ -137,7 +137,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto close_session; } - sb = sget(fs_type, NULL, v9fs_set_super, v9ses); + sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); goto clunk_fid; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 3d83075aaa2..b3be2e7c564 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -266,7 +266,7 @@ const struct dentry_operations adfs_dentry_operations = { }; static struct dentry * -adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct object_info obj; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 06fdcc9382c..bdaec92353c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -246,7 +246,6 @@ static struct inode *adfs_alloc_inode(struct super_block *sb) static void adfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 1fceb320d2f..6e216419f34 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -3,6 +3,7 @@ #include <linux/buffer_head.h> #include <linux/amigaffs.h> #include <linux/mutex.h> +#include <linux/workqueue.h> /* AmigaOS allows file names with up to 30 characters length. * Names longer than that will be silently truncated. If you @@ -100,6 +101,10 @@ struct affs_sb_info { char *s_prefix; /* Prefix for volumes and assigns. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ spinlock_t symlink_lock; /* protects the previous two */ + struct super_block *sb; /* the VFS superblock object */ + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work sb_work; /* superblock flush delayed work */ + spinlock_t work_lock; /* protects sb_work and work_queued */ }; #define SF_INTL 0x0001 /* International filesystem. */ @@ -120,6 +125,8 @@ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) return sb->s_fs_info; } +void affs_mark_sb_dirty(struct super_block *sb); + /* amigaffs.c */ extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); @@ -146,9 +153,9 @@ extern void affs_free_bitmap(struct super_block *sb); /* namei.c */ extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); -extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); +extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *); +extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool); extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 52a6407682e..eb82ee53ee0 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -122,22 +122,16 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) } static void -affs_fix_dcache(struct dentry *dentry, u32 entry_ino) +affs_fix_dcache(struct inode *inode, u32 entry_ino) { - struct inode *inode = dentry->d_inode; - void *data = dentry->d_fsdata; - struct list_head *head, *next; - + struct dentry *dentry; + struct hlist_node *p; spin_lock(&inode->i_lock); - head = &inode->i_dentry; - next = head->next; - while (next != head) { - dentry = list_entry(next, struct dentry, d_alias); + hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { - dentry->d_fsdata = data; + dentry->d_fsdata = (void *)inode->i_ino; break; } - next = next->next; } spin_unlock(&inode->i_lock); } @@ -177,7 +171,11 @@ affs_remove_link(struct dentry *dentry) } affs_lock_dir(dir); - affs_fix_dcache(dentry, link_ino); + /* + * if there's a dentry for that block, make it + * refer to inode itself. + */ + affs_fix_dcache(inode, link_ino); retval = affs_remove_hash(dir, link_bh); if (retval) { affs_unlock_dir(dir); diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 3e262711ae0..6e0be43ef6e 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -103,7 +103,7 @@ affs_free_block(struct super_block *sb, u32 block) *(__be32 *)bh->b_data = cpu_to_be32(tmp - mask); mark_buffer_dirty(bh); - sb->s_dirt = 1; + affs_mark_sb_dirty(sb); bm->bm_free++; mutex_unlock(&sbi->s_bmlock); @@ -248,7 +248,7 @@ find_bit: *(__be32 *)bh->b_data = cpu_to_be32(tmp + mask); mark_buffer_dirty(bh); - sb->s_dirt = 1; + affs_mark_sb_dirty(sb); mutex_unlock(&sbi->s_bmlock); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 47806940aac..ff65884a783 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -211,7 +211,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) } struct dentry * -affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; @@ -255,7 +255,7 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) +affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; diff --git a/fs/affs/super.c b/fs/affs/super.c index 0782653a05a..c70f1e5fc02 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -17,6 +17,7 @@ #include <linux/magic.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/writeback.h> #include "affs.h" extern struct timezone sys_tz; @@ -25,15 +26,17 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); static void -affs_commit_super(struct super_block *sb, int wait, int clean) +affs_commit_super(struct super_block *sb, int wait) { struct affs_sb_info *sbi = AFFS_SB(sb); struct buffer_head *bh = sbi->s_root_bh; struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); - tail->bm_flag = cpu_to_be32(clean); + lock_buffer(bh); secs_to_datestamp(get_seconds(), &tail->disk_change); affs_fix_checksum(sb, bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); if (wait) sync_dirty_buffer(bh); @@ -45,9 +48,7 @@ affs_put_super(struct super_block *sb) struct affs_sb_info *sbi = AFFS_SB(sb); pr_debug("AFFS: put_super()\n"); - if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt) - affs_commit_super(sb, 1, 1); - + cancel_delayed_work_sync(&sbi->sb_work); kfree(sbi->s_prefix); affs_free_bitmap(sb); affs_brelse(sbi->s_root_bh); @@ -55,26 +56,43 @@ affs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static void -affs_write_super(struct super_block *sb) +static int +affs_sync_fs(struct super_block *sb, int wait) { - lock_super(sb); - if (!(sb->s_flags & MS_RDONLY)) - affs_commit_super(sb, 1, 2); - sb->s_dirt = 0; - unlock_super(sb); + affs_commit_super(sb, wait); + return 0; +} + +static void flush_superblock(struct work_struct *work) +{ + struct affs_sb_info *sbi; + struct super_block *sb; + + sbi = container_of(work, struct affs_sb_info, sb_work.work); + sb = sbi->sb; - pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds()); + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + affs_commit_super(sb, 1); } -static int -affs_sync_fs(struct super_block *sb, int wait) +void affs_mark_sb_dirty(struct super_block *sb) { - lock_super(sb); - affs_commit_super(sb, wait, 2); - sb->s_dirt = 0; - unlock_super(sb); - return 0; + struct affs_sb_info *sbi = AFFS_SB(sb); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->sb_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); } static struct kmem_cache * affs_inode_cachep; @@ -138,7 +156,6 @@ static const struct super_operations affs_sops = { .write_inode = affs_write_inode, .evict_inode = affs_evict_inode, .put_super = affs_put_super, - .write_super = affs_write_super, .sync_fs = affs_sync_fs, .statfs = affs_statfs, .remount_fs = affs_remount, @@ -305,8 +322,11 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; + sbi->sb = sb; mutex_init(&sbi->s_bmlock); spin_lock_init(&sbi->symlink_lock); + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, @@ -531,6 +551,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } + flush_delayed_work_sync(&sbi->sb_work); replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; @@ -549,10 +570,9 @@ affs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; - if (*flags & MS_RDONLY) { - affs_write_super(sb); + if (*flags & MS_RDONLY) affs_free_bitmap(sb); - } else + else res = affs_init_bitmap(sb, flags); return res; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index e22dc4b4a50..db477906ba4 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -20,16 +20,16 @@ #include "internal.h" static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); -static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); +static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd); + bool excl); static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); @@ -516,7 +516,7 @@ out: * look up an entry in a directory */ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct afs_vnode *vnode; struct afs_fid fid; @@ -598,7 +598,7 @@ success: * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ -static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct afs_vnode *vnode, *dir; struct afs_fid uninitialized_var(fid); @@ -607,7 +607,7 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) void *dir_version; int ret; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; vnode = AFS_FS_I(dentry->d_inode); @@ -949,7 +949,7 @@ error: * create a regular file on an AFS filesystem */ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct afs_file_status status; struct afs_callback cb; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 298cf8919ec..9682c33d5da 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -22,7 +22,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); static int afs_mntpt_open(struct inode *inode, struct file *file); static void afs_mntpt_expiry_timed_out(struct work_struct *work); @@ -104,7 +104,7 @@ out: */ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { _enter("%p,%p{%p{%s},%s}", dir, diff --git a/fs/afs/super.c b/fs/afs/super.c index f02b31e7e64..df8c6047c2a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -395,7 +395,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, as->volume = vol; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, afs_set_super, as); + sb = sget(fs_type, afs_test_super, afs_set_super, flags, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); afs_put_volume(vol); @@ -406,7 +406,6 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - sb->s_flags = flags; ret = afs_fill_super(sb, ¶ms); if (ret < 0) { deactivate_locked_super(sb); @@ -56,13 +56,6 @@ static struct kmem_cache *kioctx_cachep; static struct workqueue_struct *aio_wq; -/* Used for rare fput completion. */ -static void aio_fput_routine(struct work_struct *); -static DECLARE_WORK(fput_work, aio_fput_routine); - -static DEFINE_SPINLOCK(fput_lock); -static LIST_HEAD(fput_head); - static void aio_kick_handler(struct work_struct *); static void aio_queue_work(struct kioctx *); @@ -479,7 +472,6 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) { unsigned short allocated, to_alloc; long avail; - bool called_fput = false; struct kiocb *req, *n; struct aio_ring *ring; @@ -495,28 +487,11 @@ static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) if (allocated == 0) goto out; -retry: spin_lock_irq(&ctx->ctx_lock); ring = kmap_atomic(ctx->ring_info.ring_pages[0]); avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; BUG_ON(avail < 0); - if (avail == 0 && !called_fput) { - /* - * Handle a potential starvation case. It is possible that - * we hold the last reference on a struct file, causing us - * to delay the final fput to non-irq context. In this case, - * ctx->reqs_active is artificially high. Calling the fput - * routine here may free up a slot in the event completion - * ring, allowing this allocation to succeed. - */ - kunmap_atomic(ring); - spin_unlock_irq(&ctx->ctx_lock); - aio_fput_routine(NULL); - called_fput = true; - goto retry; - } - if (avail < allocated) { /* Trim back the number of requests. */ list_for_each_entry_safe(req, n, &batch->head, ki_batch) { @@ -570,36 +545,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) wake_up_all(&ctx->wait); } -static void aio_fput_routine(struct work_struct *data) -{ - spin_lock_irq(&fput_lock); - while (likely(!list_empty(&fput_head))) { - struct kiocb *req = list_kiocb(fput_head.next); - struct kioctx *ctx = req->ki_ctx; - - list_del(&req->ki_list); - spin_unlock_irq(&fput_lock); - - /* Complete the fput(s) */ - if (req->ki_filp != NULL) - fput(req->ki_filp); - - /* Link the iocb into the context's free list */ - rcu_read_lock(); - spin_lock_irq(&ctx->ctx_lock); - really_put_req(ctx, req); - /* - * at that point ctx might've been killed, but actual - * freeing is RCU'd - */ - spin_unlock_irq(&ctx->ctx_lock); - rcu_read_unlock(); - - spin_lock_irq(&fput_lock); - } - spin_unlock_irq(&fput_lock); -} - /* __aio_put_req * Returns true if this put was the last user of the request. */ @@ -618,21 +563,9 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_cancel = NULL; req->ki_retry = NULL; - /* - * Try to optimize the aio and eventfd file* puts, by avoiding to - * schedule work in case it is not final fput() time. In normal cases, - * we would not be holding the last reference to the file*, so - * this function will be executed w/out any aio kthread wakeup. - */ - if (unlikely(!fput_atomic(req->ki_filp))) { - spin_lock(&fput_lock); - list_add(&req->ki_list, &fput_head); - spin_unlock(&fput_lock); - schedule_work(&fput_work); - } else { - req->ki_filp = NULL; - really_put_req(ctx, req); - } + fput(req->ki_filp); + req->ki_filp = NULL; + really_put_req(ctx, req); return 1; } diff --git a/fs/attr.c b/fs/attr.c index 0da90951d27..29e38a1f7f7 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -171,6 +171,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr) struct timespec now; unsigned int ia_valid = attr->ia_valid; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; @@ -250,5 +252,4 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return error; } - EXPORT_SYMBOL(notify_change); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index aa9103f8f01..abf645c1703 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -257,8 +257,8 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) * corresponding to the autofs fs we want to open. */ - filp = dentry_open(path.dentry, path.mnt, O_RDONLY, - current_cred()); + filp = dentry_open(&path, O_RDONLY, current_cred()); + path_put(&path); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 75e5f1c8e02..e7396cfdb10 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -32,7 +32,7 @@ static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); +static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); static int autofs4_d_manage(struct dentry *, bool); static void autofs4_dentry_release(struct dentry *); @@ -458,7 +458,7 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 1b35d6bd06b..b1342ffb3cf 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -173,13 +173,13 @@ static const struct file_operations bad_file_ops = }; static int bad_inode_create (struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index e18da23d42b..cf7f3c67c8b 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -34,7 +34,7 @@ static int befs_readdir(struct file *, void *, filldir_t); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); static int befs_readpage(struct file *file, struct page *page); static sector_t befs_bmap(struct address_space *mapping, sector_t block); -static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int); static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); @@ -159,7 +159,7 @@ befs_get_block(struct inode *inode, sector_t block, } static struct dentry * -befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct super_block *sb = dir->i_sb; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d12c7966db2..2785ef91191 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -85,7 +85,7 @@ const struct file_operations bfs_dir_operations = { extern void dump_imap(const char *, struct super_block *); static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { int err; struct inode *inode; @@ -133,7 +133,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; struct buffer_head *bh; diff --git a/fs/block_dev.c b/fs/block_dev.c index c2bbe1fb132..1e519195d45 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1710,3 +1710,39 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) return res; } EXPORT_SYMBOL(__invalidate_device); + +void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +{ + struct inode *inode, *old_inode = NULL; + + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ + iput(old_inode); + old_inode = inode; + + func(I_BDEV(inode), arg); + + spin_lock(&inode_sb_list_lock); + } + spin_unlock(&inode_sb_list_lock); + iput(old_inode); +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 01c21b6c6d4..deafe19c34b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -929,7 +929,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, /** - * convert_extent - convert all bits in a given range from one bit to another + * convert_extent_bit - convert all bits in a given range from one bit to + * another * @tree: the io tree to search * @start: the start offset in bytes * @end: the end offset in bytes (inclusive) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7d1921ac76..fb8d671d00e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4247,7 +4247,7 @@ static void btrfs_dentry_release(struct dentry *dentry) } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *ret; @@ -4893,7 +4893,7 @@ out_unlock: } static int btrfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -6987,7 +6987,7 @@ void btrfs_destroy_inode(struct inode *inode) struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; - WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(!hlist_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0e92e576300..1e9f6c019ad 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3268,7 +3268,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -3338,7 +3338,7 @@ out_bargs: out: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e23991574fd..b19d7556772 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1068,7 +1068,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -1082,7 +1083,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7f0771d3894..b0b5f7cdfff 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -567,7 +567,7 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); + ret = vfs_create(dir->d_inode, next, S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); if (ret < 0) goto create_error; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 0e3c0924cc3..c0353dfac51 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -891,6 +891,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) struct cachefiles_cache *cache; mm_segment_t old_fs; struct file *file; + struct path path; loff_t pos, eof; size_t len; void *data; @@ -916,10 +917,9 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) /* write the page to the backing filesystem and let it store it in its * own time */ - dget(object->backer); - mntget(cache->mnt); - file = dentry_open(object->backer, cache->mnt, O_RDWR, - cache->cache_cred); + path.mnt = cache->mnt; + path.dentry = object->backer; + file = dentry_open(&path, O_RDWR, cache->cache_cred); if (IS_ERR(file)) { ret = PTR_ERR(file); } else { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e8094be460..00894ff9246 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -576,7 +576,7 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) * the MDS so that it gets our 'caps wanted' value in a single op. */ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -594,14 +594,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, if (err < 0) return ERR_PTR(err); - /* open (but not create!) intent? */ - if (nd && - (nd->flags & LOOKUP_OPEN) && - !(nd->intent.open.flags & O_CREAT)) { - int mode = nd->intent.open.create_mode & ~current->fs->umask; - return ceph_lookup_open(dir, dentry, nd, mode, 1); - } - /* can we conclude ENOENT locally? */ if (dentry->d_inode == NULL) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -642,13 +634,51 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, return dentry; } +int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) +{ + int err; + struct dentry *res = NULL; + + if (!(flags & O_CREAT)) { + if (dentry->d_name.len > NAME_MAX) + return -ENAMETOOLONG; + + err = ceph_init_dentry(dentry); + if (err < 0) + return err; + + return ceph_lookup_open(dir, dentry, file, flags, mode, opened); + } + + if (d_unhashed(dentry)) { + res = ceph_lookup(dir, dentry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + dentry = res; + } + + /* We don't deal with positive dentries here */ + if (dentry->d_inode) + return finish_no_open(file, res); + + *opened |= FILE_CREATED; + err = ceph_lookup_open(dir, dentry, file, flags, mode, opened); + dput(res); + + return err; +} + /* * If we do a create but get no trace back from the MDS, follow up with * a lookup (the VFS expects us to link up the provided dentry). */ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) { - struct dentry *result = ceph_lookup(dir, dentry, NULL); + struct dentry *result = ceph_lookup(dir, dentry, 0); if (result && !IS_ERR(result)) { /* @@ -700,25 +730,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, } static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { - dout("create in dir %p dentry %p name '%.*s'\n", - dir, dentry, dentry->d_name.len, dentry->d_name.name); - - if (ceph_snap(dir) != CEPH_NOSNAP) - return -EROFS; - - if (nd) { - BUG_ON((nd->flags & LOOKUP_OPEN) == 0); - dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); - /* hrm, what should i do here if we get aliased? */ - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - return 0; - } - - /* fall back to mknod */ - return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); + return ceph_mknod(dir, dentry, mode, 0); } static int ceph_symlink(struct inode *dir, struct dentry *dentry, @@ -1028,12 +1042,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) /* * Check if cached dentry can be trusted. */ -static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) { int valid = 0; struct inode *dir; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, @@ -1080,7 +1094,7 @@ static void ceph_d_release(struct dentry *dentry) } static int ceph_snapdir_d_revalidate(struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { /* * Eventually, we'll want to revalidate snapped metadata @@ -1357,6 +1371,7 @@ const struct inode_operations ceph_dir_iops = { .rmdir = ceph_unlink, .rename = ceph_rename, .create = ceph_create, + .atomic_open = ceph_atomic_open, }; const struct dentry_operations ceph_dentry_ops = { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 988d4f302e4..1b81d6c3187 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -213,22 +213,15 @@ out: * may_open() fails, the struct *file gets cleaned up (i.e. * ceph_release gets called). So fear not! */ -/* - * flags - * path_lookup_open -> LOOKUP_OPEN - * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE - */ -struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, - struct nameidata *nd, int mode, - int locked_dir) +int ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file; struct ceph_mds_request *req; struct dentry *ret; int err; - int flags = nd->intent.open.flags; dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); @@ -236,7 +229,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) - return ERR_CAST(req); + return PTR_ERR(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { @@ -254,14 +247,17 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, err = ceph_handle_notrace_create(dir, dentry); if (err) goto out; - file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); - if (IS_ERR(file)) - err = PTR_ERR(file); + err = finish_open(file, req->r_dentry, ceph_open, opened); out: ret = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); dout("ceph_lookup_open result=%p\n", ret); - return ret; + + if (IS_ERR(ret)) + return PTR_ERR(ret); + + dput(ret); + return err; } int ceph_release(struct inode *inode, struct file *file) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1e67dd7305a..7076109f014 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -871,7 +871,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, fsc); + sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); if (IS_ERR(sb)) { res = ERR_CAST(sb); goto out; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fc35036d258..f4d5522cb61 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -806,9 +806,9 @@ extern int ceph_copy_from_page_vector(struct page **pages, loff_t off, size_t len); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_open(struct inode *inode, struct file *file); -extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, - struct nameidata *nd, int mode, - int locked_dir); +extern int ceph_lookup_open(struct inode *dir, struct dentry *dentry, + struct file *od, unsigned flags, + umode_t mode, int *opened); extern int ceph_release(struct inode *inode, struct file *filp); /* dir.c */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8b6e344eb0b..a7610cfedf0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -257,7 +257,6 @@ cifs_alloc_inode(struct super_block *sb) static void cifs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); } @@ -638,7 +637,10 @@ cifs_do_mount(struct file_system_type *fs_type, mnt_data.cifs_sb = cifs_sb; mnt_data.flags = flags; - sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data); + /* BB should we make this contingent on mount parm? */ + flags |= MS_NODIRATIME | MS_NOATIME; + + sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); if (IS_ERR(sb)) { root = ERR_CAST(sb); cifs_umount(cifs_sb); @@ -649,10 +651,6 @@ cifs_do_mount(struct file_system_type *fs_type, cFYI(1, "Use existing superblock"); cifs_umount(cifs_sb); } else { - sb->s_flags = flags; - /* BB should we make this contingent on mount parm? */ - sb->s_flags |= MS_NODIRATIME | MS_NOATIME; - rc = cifs_read_super(sb); if (rc) { root = ERR_PTR(rc); @@ -778,6 +776,7 @@ struct file_system_type cifs_fs_type = { }; const struct inode_operations cifs_dir_inode_ops = { .create = cifs_create, + .atomic_open = cifs_atomic_open, .lookup = cifs_lookup, .getattr = cifs_getattr, .unlink = cifs_unlink, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 65365358c97..1c49c5a9b27 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -45,9 +45,12 @@ extern const struct address_space_operations cifs_addr_ops_smallbuf; extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); extern int cifs_create(struct inode *, struct dentry *, umode_t, - struct nameidata *); + bool excl); +extern int cifs_atomic_open(struct inode *, struct dentry *, + struct file *, unsigned, umode_t, + int *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, - struct nameidata *); + unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5b400730c21..4ee522b3f66 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -86,7 +86,31 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ -/* Forward declarations */ +#ifdef CONFIG_HIGHMEM +/* + * On arches that have high memory, kmap address space is limited. By + * serializing the kmap operations on those arches, we ensure that we don't + * end up with a bunch of threads in writeback with partially mapped page + * arrays, stuck waiting for kmap to come back. That situation prevents + * progress and can deadlock. + */ +static DEFINE_MUTEX(cifs_kmap_mutex); + +static inline void +cifs_kmap_lock(void) +{ + mutex_lock(&cifs_kmap_mutex); +} + +static inline void +cifs_kmap_unlock(void) +{ + mutex_unlock(&cifs_kmap_mutex); +} +#else /* !CONFIG_HIGHMEM */ +#define cifs_kmap_lock() do { ; } while(0) +#define cifs_kmap_unlock() do { ; } while(0) +#endif /* CONFIG_HIGHMEM */ /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ @@ -1503,7 +1527,9 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) } /* marshal up the page array */ + cifs_kmap_lock(); len = rdata->marshal_iov(rdata, data_len); + cifs_kmap_unlock(); data_len -= len; /* issue the read if we have any iovecs left to fill */ @@ -2069,7 +2095,9 @@ cifs_async_writev(struct cifs_writedata *wdata) * and set the iov_len properly for each one. It may also set * wdata->bytes too. */ + cifs_kmap_lock(); wdata->marshal_iov(iov, wdata); + cifs_kmap_unlock(); cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0ae86ddf221..94b7788c318 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3445,6 +3445,18 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) #define CIFS_DEFAULT_NON_POSIX_WSIZE (65536) +/* + * On hosts with high memory, we can't currently support wsize/rsize that are + * larger than we can kmap at once. Cap the rsize/wsize at + * LAST_PKMAP * PAGE_SIZE. We'll never be able to fill a read or write request + * larger than that anyway. + */ +#ifdef CONFIG_HIGHMEM +#define CIFS_KMAP_SIZE_LIMIT (LAST_PKMAP * PAGE_CACHE_SIZE) +#else /* CONFIG_HIGHMEM */ +#define CIFS_KMAP_SIZE_LIMIT (1<<24) +#endif /* CONFIG_HIGHMEM */ + static unsigned int cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) { @@ -3475,6 +3487,9 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) wsize = min_t(unsigned int, wsize, server->maxBuf - sizeof(WRITE_REQ) + 4); + /* limit to the amount that we can kmap at once */ + wsize = min_t(unsigned int, wsize, CIFS_KMAP_SIZE_LIMIT); + /* hard limit of CIFS_MAX_WSIZE */ wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); @@ -3516,6 +3531,9 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) if (!(server->capabilities & CAP_LARGE_READ_X)) rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + /* limit to the amount that we can kmap at once */ + rsize = min_t(unsigned int, rsize, CIFS_KMAP_SIZE_LIMIT); + /* hard limit of CIFS_MAX_RSIZE */ rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index ec4e9a2a12f..a180265a10b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -133,108 +133,141 @@ cifs_bp_rename_retry: return full_path; } +/* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ +static int +check_name(struct dentry *direntry) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); + int i; + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + for (i = 0; i < direntry->d_name.len; i++) { + if (direntry->d_name.name[i] == '\\') { + cFYI(1, "Invalid file name"); + return -EINVAL; + } + } + } + return 0; +} + + /* Inode operations in similar order to how they appear in Linux file fs.h */ -int -cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, - struct nameidata *nd) +static int cifs_do_create(struct inode *inode, struct dentry *direntry, + int xid, struct tcon_link *tlink, unsigned oflags, + umode_t mode, __u32 *oplock, __u16 *fileHandle, + int *created) { int rc = -ENOENT; - int xid; int create_options = CREATE_NOT_DIR; - __u32 oplock = 0; - int oflags; - /* - * BB below access is probably too much for mknod to request - * but we have to do query and setpathinfo so requesting - * less could fail (unless we want to request getatr and setatr - * permissions (only). At least for POSIX we do not have to - * request so much. - */ - int desiredAccess = GENERIC_READ | GENERIC_WRITE; - __u16 fileHandle; - struct cifs_sb_info *cifs_sb; - struct tcon_link *tlink; - struct cifs_tcon *tcon; + int desiredAccess; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = tlink_tcon(tlink); char *full_path = NULL; FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; - int disposition = FILE_OVERWRITE_IF; - - xid = GetXid(); - - cifs_sb = CIFS_SB(inode->i_sb); - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - FreeXid(xid); - return PTR_ERR(tlink); - } - tcon = tlink_tcon(tlink); + int disposition; + *oplock = 0; if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - - if (nd) - oflags = nd->intent.open.file->f_flags; - else - oflags = O_RDONLY | O_CREAT; + *oplock = REQ_OPLOCK; full_path = build_path_from_dentry(direntry); if (full_path == NULL) { rc = -ENOMEM; - goto cifs_create_out; + goto out; } if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = cifs_posix_open(full_path, &newinode, - inode->i_sb, mode, oflags, &oplock, &fileHandle, xid); - /* EIO could indicate that (posix open) operation is not - supported, despite what server claimed in capability - negotiation. EREMOTE indicates DFS junction, which is not - handled in posix open */ - - if (rc == 0) { - if (newinode == NULL) /* query inode info */ + inode->i_sb, mode, oflags, oplock, fileHandle, xid); + switch (rc) { + case 0: + if (newinode == NULL) { + /* query inode info */ goto cifs_create_get_file_info; - else /* success, no need to query */ - goto cifs_create_set_dentry; - } else if ((rc != -EIO) && (rc != -EREMOTE) && - (rc != -EOPNOTSUPP) && (rc != -EINVAL)) - goto cifs_create_out; - /* else fallthrough to retry, using older open call, this is - case where server does not support this SMB level, and - falsely claims capability (also get here for DFS case - which should be rare for path not covered on files) */ - } + } + + if (!S_ISREG(newinode->i_mode)) { + /* + * The server may allow us to open things like + * FIFOs, but the client isn't set up to deal + * with that. If it's not a regular file, just + * close it and proceed as if it were a normal + * lookup. + */ + CIFSSMBClose(xid, tcon, *fileHandle); + goto cifs_create_get_file_info; + } + /* success, no need to query */ + goto cifs_create_set_dentry; + + case -ENOENT: + goto cifs_create_get_file_info; + + case -EIO: + case -EINVAL: + /* + * EIO could indicate that (posix open) operation is not + * supported, despite what server claimed in capability + * negotiation. + * + * POSIX open in samba versions 3.3.1 and earlier could + * incorrectly fail with invalid parameter. + */ + tcon->broken_posix_open = true; + break; + + case -EREMOTE: + case -EOPNOTSUPP: + /* + * EREMOTE indicates DFS junction, which is not handled + * in posix open. If either that or op not supported + * returned, follow the normal lookup. + */ + break; - if (nd) { - /* if the file is going to stay open, then we - need to set the desired access properly */ - desiredAccess = 0; - if (OPEN_FMODE(oflags) & FMODE_READ) - desiredAccess |= GENERIC_READ; /* is this too little? */ - if (OPEN_FMODE(oflags) & FMODE_WRITE) - desiredAccess |= GENERIC_WRITE; - - if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) - disposition = FILE_CREATE; - else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) - disposition = FILE_OVERWRITE_IF; - else if ((oflags & O_CREAT) == O_CREAT) - disposition = FILE_OPEN_IF; - else - cFYI(1, "Create flag not set in create function"); + default: + goto out; + } + /* + * fallthrough to retry, using older open call, this is case + * where server does not support this SMB level, and falsely + * claims capability (also get here for DFS case which should be + * rare for path not covered on files) + */ } + desiredAccess = 0; + if (OPEN_FMODE(oflags) & FMODE_READ) + desiredAccess |= GENERIC_READ; /* is this too little? */ + if (OPEN_FMODE(oflags) & FMODE_WRITE) + desiredAccess |= GENERIC_WRITE; + + disposition = FILE_OVERWRITE_IF; + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + disposition = FILE_CREATE; + else if ((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC)) + disposition = FILE_OVERWRITE_IF; + else if ((oflags & O_CREAT) == O_CREAT) + disposition = FILE_OPEN_IF; + else + cFYI(1, "Create flag not set in create function"); + /* BB add processing to set equivalent of mode - e.g. via CreateX with ACLs */ buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { rc = -ENOMEM; - goto cifs_create_out; + goto out; } /* @@ -250,7 +283,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, if (tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, - &fileHandle, &oplock, buf, cifs_sb->local_nls, + fileHandle, oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else rc = -EIO; /* no NT SMB support fall into legacy open below */ @@ -259,17 +292,17 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, /* old server, retry the open legacy style */ rc = SMBLegacyOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, - &fileHandle, &oplock, buf, cifs_sb->local_nls, + fileHandle, oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } if (rc) { cFYI(1, "cifs_create returned 0x%x", rc); - goto cifs_create_out; + goto out; } /* If Open reported that we actually created a file then we now have to set the mode if possible */ - if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { + if ((tcon->unix_ext) && (*oplock & CIFS_CREATE_ACTION)) { struct cifs_unix_set_info_args args = { .mode = mode, .ctime = NO_CHANGE_64, @@ -278,6 +311,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, .device = 0, }; + *created |= FILE_CREATED; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { args.uid = (__u64) current_fsuid(); if (inode->i_mode & S_ISGID) @@ -288,7 +322,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, args.uid = NO_CHANGE_64; args.gid = NO_CHANGE_64; } - CIFSSMBUnixSetFileInfo(xid, tcon, &args, fileHandle, + CIFSSMBUnixSetFileInfo(xid, tcon, &args, *fileHandle, current->tgid); } else { /* BB implement mode setting via Windows security @@ -305,11 +339,11 @@ cifs_create_get_file_info: inode->i_sb, xid); else { rc = cifs_get_inode_info(&newinode, full_path, buf, - inode->i_sb, xid, &fileHandle); + inode->i_sb, xid, fileHandle); if (newinode) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) newinode->i_mode = mode; - if ((oplock & CIFS_CREATE_ACTION) && + if ((*oplock & CIFS_CREATE_ACTION) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { newinode->i_uid = current_fsuid(); if (inode->i_mode & S_ISGID) @@ -321,40 +355,139 @@ cifs_create_get_file_info: } cifs_create_set_dentry: - if (rc == 0) - d_instantiate(direntry, newinode); - else + if (rc != 0) { cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); + goto out; + } + d_drop(direntry); + d_add(direntry, newinode); - if (newinode && nd) { - struct cifsFileInfo *pfile_info; - struct file *filp; + /* ENOENT for create? How weird... */ + rc = -ENOENT; + if (!newinode) { + CIFSSMBClose(xid, tcon, *fileHandle); + goto out; + } + rc = 0; - filp = lookup_instantiate_filp(nd, direntry, generic_file_open); - if (IS_ERR(filp)) { - rc = PTR_ERR(filp); - CIFSSMBClose(xid, tcon, fileHandle); - goto cifs_create_out; - } +out: + kfree(buf); + kfree(full_path); + return rc; +} - pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock); - if (pfile_info == NULL) { - fput(filp); - CIFSSMBClose(xid, tcon, fileHandle); - rc = -ENOMEM; - } - } else { +int +cifs_atomic_open(struct inode *inode, struct dentry *direntry, + struct file *file, unsigned oflags, umode_t mode, + int *opened) +{ + int rc; + int xid; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + __u16 fileHandle; + __u32 oplock; + struct file *filp; + struct cifsFileInfo *pfile_info; + + /* Posix open is only called (at lookup time) for file create now. For + * opens (rather than creates), because we do not know if it is a file + * or directory yet, and current Samba no longer allows us to do posix + * open on dirs, we could end up wasting an open call on what turns out + * to be a dir. For file opens, we wait to call posix open till + * cifs_open. It could be added to atomic_open in the future but the + * performance tradeoff of the extra network request when EISDIR or + * EACCES is returned would have to be weighed against the 50% reduction + * in network traffic in the other paths. + */ + if (!(oflags & O_CREAT)) { + struct dentry *res = cifs_lookup(inode, direntry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + return finish_no_open(file, res); + } + + rc = check_name(direntry); + if (rc) + return rc; + + xid = GetXid(); + + cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p", + inode, direntry->d_name.name, direntry); + + tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); + filp = ERR_CAST(tlink); + if (IS_ERR(tlink)) + goto free_xid; + + tcon = tlink_tcon(tlink); + + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, + &oplock, &fileHandle, opened); + + if (rc) + goto out; + + rc = finish_open(file, direntry, generic_file_open, opened); + if (rc) { CIFSSMBClose(xid, tcon, fileHandle); + goto out; } -cifs_create_out: - kfree(buf); - kfree(full_path); + pfile_info = cifs_new_fileinfo(fileHandle, filp, tlink, oplock); + if (pfile_info == NULL) { + CIFSSMBClose(xid, tcon, fileHandle); + fput(filp); + rc = -ENOMEM; + } + +out: cifs_put_tlink(tlink); +free_xid: FreeXid(xid); return rc; } +int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, + bool excl) +{ + int rc; + int xid = GetXid(); + /* + * BB below access is probably too much for mknod to request + * but we have to do query and setpathinfo so requesting + * less could fail (unless we want to request getatr and setatr + * permissions (only). At least for POSIX we do not have to + * request so much. + */ + unsigned oflags = O_EXCL | O_CREAT | O_RDWR; + struct tcon_link *tlink; + __u16 fileHandle; + __u32 oplock; + int created = FILE_CREATED; + + cFYI(1, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p", + inode, direntry->d_name.name, direntry); + + tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); + rc = PTR_ERR(tlink); + if (IS_ERR(tlink)) + goto free_xid; + + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, + &oplock, &fileHandle, &created); + if (!rc) + CIFSSMBClose(xid, tlink_tcon(tlink), fileHandle); + + cifs_put_tlink(tlink); +free_xid: + FreeXid(xid); + + return rc; +} + int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, dev_t device_number) { @@ -488,20 +621,15 @@ mknod_out: struct dentry * cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, - struct nameidata *nd) + unsigned int flags) { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ - __u32 oplock; - __u16 fileHandle = 0; - bool posix_open = false; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *pTcon; - struct cifsFileInfo *cfile; struct inode *newInode = NULL; char *full_path = NULL; - struct file *filp; xid = GetXid(); @@ -518,31 +646,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } pTcon = tlink_tcon(tlink); - oplock = pTcon->ses->server->oplocks ? REQ_OPLOCK : 0; - - /* - * Don't allow the separator character in a path component. - * The VFS will not allow "/", but "\" is allowed by posix. - */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { - int i; - for (i = 0; i < direntry->d_name.len; i++) - if (direntry->d_name.name[i] == '\\') { - cFYI(1, "Invalid file name"); - rc = -EINVAL; - goto lookup_out; - } - } - - /* - * O_EXCL: optimize away the lookup, but don't hash the dentry. Let - * the VFS handle the create. - */ - if (nd && (nd->flags & LOOKUP_EXCL)) { - d_instantiate(direntry, NULL); - rc = 0; + rc = check_name(direntry); + if (rc) goto lookup_out; - } /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) @@ -560,80 +666,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode); - /* Posix open is only called (at lookup time) for file create now. - * For opens (rather than creates), because we do not know if it - * is a file or directory yet, and current Samba no longer allows - * us to do posix open on dirs, we could end up wasting an open call - * on what turns out to be a dir. For file opens, we wait to call posix - * open till cifs_open. It could be added here (lookup) in the future - * but the performance tradeoff of the extra network request when EISDIR - * or EACCES is returned would have to be weighed against the 50% - * reduction in network traffic in the other paths. - */ if (pTcon->unix_ext) { - if (nd && !(nd->flags & LOOKUP_DIRECTORY) && - (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && - (nd->intent.open.file->f_flags & O_CREAT)) { - rc = cifs_posix_open(full_path, &newInode, - parent_dir_inode->i_sb, - nd->intent.open.create_mode, - nd->intent.open.file->f_flags, &oplock, - &fileHandle, xid); - /* - * The check below works around a bug in POSIX - * open in samba versions 3.3.1 and earlier where - * open could incorrectly fail with invalid parameter. - * If either that or op not supported returned, follow - * the normal lookup. - */ - switch (rc) { - case 0: - /* - * The server may allow us to open things like - * FIFOs, but the client isn't set up to deal - * with that. If it's not a regular file, just - * close it and proceed as if it were a normal - * lookup. - */ - if (newInode && !S_ISREG(newInode->i_mode)) { - CIFSSMBClose(xid, pTcon, fileHandle); - break; - } - case -ENOENT: - posix_open = true; - case -EOPNOTSUPP: - break; - default: - pTcon->broken_posix_open = true; - } - } - if (!posix_open) - rc = cifs_get_inode_info_unix(&newInode, full_path, - parent_dir_inode->i_sb, xid); - } else + rc = cifs_get_inode_info_unix(&newInode, full_path, + parent_dir_inode->i_sb, xid); + } else { rc = cifs_get_inode_info(&newInode, full_path, NULL, parent_dir_inode->i_sb, xid, NULL); + } if ((rc == 0) && (newInode != NULL)) { d_add(direntry, newInode); - if (posix_open) { - filp = lookup_instantiate_filp(nd, direntry, - generic_file_open); - if (IS_ERR(filp)) { - rc = PTR_ERR(filp); - CIFSSMBClose(xid, pTcon, fileHandle); - goto lookup_out; - } - - cfile = cifs_new_fileinfo(fileHandle, filp, tlink, - oplock); - if (cfile == NULL) { - fput(filp); - CIFSSMBClose(xid, pTcon, fileHandle); - rc = -ENOMEM; - goto lookup_out; - } - } /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); @@ -658,9 +700,9 @@ lookup_out: } static int -cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) +cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { - if (nd && (nd->flags & LOOKUP_RCU)) + if (flags & LOOKUP_RCU) return -ECHILD; if (direntry->d_inode) { @@ -689,7 +731,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) * This may be nfsd (or something), anyway, we can't see the * intent of this. So, since this can be for creation, drop it. */ - if (!nd) + if (!flags) return 0; /* @@ -697,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) * case sensitive name which is specified by user if this is * for creation. */ - if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 745da3d0653..8e8bb49112f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -800,7 +800,7 @@ cifs_find_inode(struct inode *inode, void *opaque) return 0; /* if it's not a directory or has no dentries, then flag it */ - if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) + if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; return 1; @@ -825,9 +825,10 @@ static bool inode_has_hashed_dentries(struct inode *inode) { struct dentry *dentry; + struct hlist_node *p; spin_lock(&inode->i_lock); - list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&inode->i_lock); return true; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 0a8224d1c4c..a4217f02fab 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -86,9 +86,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, dentry = d_lookup(parent, name); if (dentry) { - /* FIXME: check for inode number changes? */ - if (dentry->d_inode != NULL) + inode = dentry->d_inode; + /* update inode in place if i_ino didn't change */ + if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { + cifs_fattr_to_inode(inode, fattr); return dentry; + } d_drop(dentry); dput(dentry); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3097ee58fd7..f25d4ea14be 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -365,16 +365,14 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov, if (mid == NULL) return -ENOMEM; - /* put it on the pending_mid_q */ - spin_lock(&GlobalMid_Lock); - list_add_tail(&mid->qhead, &server->pending_mid_q); - spin_unlock(&GlobalMid_Lock); - rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number); - if (rc) - delete_mid(mid); + if (rc) { + DeleteMidQEntry(mid); + return rc; + } + *ret_mid = mid; - return rc; + return 0; } /* @@ -407,17 +405,21 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, mid->callback_data = cbdata; mid->mid_state = MID_REQUEST_SUBMITTED; + /* put it on the pending_mid_q */ + spin_lock(&GlobalMid_Lock); + list_add_tail(&mid->qhead, &server->pending_mid_q); + spin_unlock(&GlobalMid_Lock); + + cifs_in_send_inc(server); rc = smb_sendv(server, iov, nvec); cifs_in_send_dec(server); cifs_save_when_sent(mid); mutex_unlock(&server->srv_mutex); - if (rc) - goto out_err; + if (rc == 0) + return 0; - return rc; -out_err: delete_mid(mid); add_credits(server, 1); wake_up(&server->request_q); diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 69015787618..958ae0e0ff8 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -89,17 +89,13 @@ int coda_cache_check(struct inode *inode, int mask) /* this won't do any harm: just flag all children */ static void coda_flag_children(struct dentry *parent, int flag) { - struct list_head *child; struct dentry *de; spin_lock(&parent->d_lock); - list_for_each(child, &parent->d_subdirs) - { - de = list_entry(child, struct dentry, d_u.d_child); + list_for_each_entry(de, &parent->d_subdirs, d_u.d_child) { /* don't know what to do with negative dentries */ - if ( ! de->d_inode ) - continue; - coda_flag_inode(de->d_inode, flag); + if (de->d_inode ) + coda_flag_inode(de->d_inode, flag); } spin_unlock(&parent->d_lock); return; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 17751582906..49fe52d2560 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -30,8 +30,8 @@ #include "coda_int.h" /* dir inode-ops */ -static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, struct nameidata *nd); -static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); +static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl); +static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags); static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, struct dentry *entry); static int coda_unlink(struct inode *dir_inode, struct dentry *entry); @@ -46,7 +46,7 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, static int coda_readdir(struct file *file, void *buf, filldir_t filldir); /* dentry ops */ -static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); +static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); static int coda_dentry_delete(const struct dentry *); /* support routines */ @@ -94,7 +94,7 @@ const struct file_operations coda_dir_operations = { /* inode operations for directories */ /* access routines: lookup, readlink, permission */ -static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) +static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { struct super_block *sb = dir->i_sb; const char *name = entry->d_name.name; @@ -188,7 +188,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, struct nameidata *nd) +static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl) { int error; const char *name=de->d_name.name; @@ -536,12 +536,12 @@ out: } /* called when a cache lookup succeeds */ -static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) +static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) { struct inode *inode; struct coda_inode_info *cii; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = de->d_inode; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7e6c52d8a20..7414ae24a79 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -442,7 +442,7 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den static struct dentry * configfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct configfs_dirent * sd; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index d013c46402e..28cca01ca9c 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -417,7 +417,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * Lookup and fill in the inode data.. */ -static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; diff --git a/fs/dcache.c b/fs/dcache.c index 40469044088..8086636bf79 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -218,7 +218,7 @@ static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - WARN_ON(!list_empty(&dentry->d_alias)); + WARN_ON(!hlist_unhashed(&dentry->d_alias)); if (dname_external(dentry)) kfree(dentry->d_name.name); kmem_cache_free(dentry_cache, dentry); @@ -267,7 +267,7 @@ static void dentry_iput(struct dentry * dentry) struct inode *inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; - list_del_init(&dentry->d_alias); + hlist_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -291,7 +291,7 @@ static void dentry_unlink_inode(struct dentry * dentry) { struct inode *inode = dentry->d_inode; dentry->d_inode = NULL; - list_del_init(&dentry->d_alias); + hlist_del_init(&dentry->d_alias); dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -699,10 +699,11 @@ EXPORT_SYMBOL(dget_parent); static struct dentry *__d_find_alias(struct inode *inode, int want_discon) { struct dentry *alias, *discon_alias; + struct hlist_node *p; again: discon_alias = NULL; - list_for_each_entry(alias, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (IS_ROOT(alias) && @@ -737,7 +738,7 @@ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; - if (!list_empty(&inode->i_dentry)) { + if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode, 0); spin_unlock(&inode->i_lock); @@ -753,9 +754,10 @@ EXPORT_SYMBOL(d_find_alias); void d_prune_aliases(struct inode *inode) { struct dentry *dentry; + struct hlist_node *p; restart: spin_lock(&inode->i_lock); - list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_count) { __dget_dlock(dentry); @@ -977,7 +979,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; - list_del_init(&dentry->d_alias); + hlist_del_init(&dentry->d_alias); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else @@ -1312,7 +1314,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); - INIT_LIST_HEAD(&dentry->d_alias); + INIT_HLIST_NODE(&dentry->d_alias); INIT_LIST_HEAD(&dentry->d_u.d_child); d_set_d_op(dentry, dentry->d_sb->s_d_op); @@ -1400,7 +1402,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if (inode) { if (unlikely(IS_AUTOMOUNT(inode))) dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - list_add(&dentry->d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_alias, &inode->i_dentry); } dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); @@ -1425,7 +1427,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { - BUG_ON(!list_empty(&entry->d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_alias)); if (inode) spin_lock(&inode->i_lock); __d_instantiate(entry, inode); @@ -1458,13 +1460,14 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, int len = entry->d_name.len; const char *name = entry->d_name.name; unsigned int hash = entry->d_name.hash; + struct hlist_node *p; if (!inode) { __d_instantiate(entry, NULL); return NULL; } - list_for_each_entry(alias, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or @@ -1490,7 +1493,7 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) { struct dentry *result; - BUG_ON(!list_empty(&entry->d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_alias)); if (inode) spin_lock(&inode->i_lock); @@ -1531,9 +1534,9 @@ static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; - if (list_empty(&inode->i_dentry)) + if (hlist_empty(&inode->i_dentry)) return NULL; - alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); __dget(alias); return alias; } @@ -1607,7 +1610,7 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_lock(&tmp->d_lock); tmp->d_inode = inode; tmp->d_flags |= DCACHE_DISCONNECTED; - list_add(&tmp->d_alias, &inode->i_dentry); + hlist_add_head(&tmp->d_alias, &inode->i_dentry); hlist_bl_lock(&tmp->d_sb->s_anon); hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); hlist_bl_unlock(&tmp->d_sb->s_anon); @@ -2384,14 +2387,13 @@ static struct dentry *__d_unalias(struct inode *inode, struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL, *m2 = NULL; - struct dentry *ret; + struct dentry *ret = ERR_PTR(-EBUSY); /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ - ret = ERR_PTR(-EBUSY); if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; @@ -2399,8 +2401,10 @@ static struct dentry *__d_unalias(struct inode *inode, goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: - __d_move(alias, dentry); - ret = alias; + if (likely(!d_mountpoint(alias))) { + __d_move(alias, dentry); + ret = alias; + } out_err: spin_unlock(&inode->i_lock); if (m2) @@ -2622,7 +2626,7 @@ global_root: if (!slash) error = prepend(buffer, buflen, "/", 1); if (!error) - error = real_mount(vfsmnt)->mnt_ns ? 1 : 2; + error = is_mounted(vfsmnt) ? 1 : 2; goto out; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0de5e26870c..4733eab34a2 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -54,13 +54,12 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev break; case S_IFLNK: inode->i_op = &debugfs_link_operations; - inode->i_fop = fops; inode->i_private = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; - inode->i_fop = fops ? fops : &simple_dir_operations; - inode->i_private = data; + inode->i_fop = &simple_dir_operations; + inode->i_private = NULL; /* directory inodes start off with i_nlink == 2 * (for "." entry) */ @@ -91,13 +90,12 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, return error; } -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, - void *data, const struct file_operations *fops) +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int res; mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; - res = debugfs_mknod(dir, dentry, mode, 0, data, fops); + res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL); if (!res) { inc_nlink(dir); fsnotify_mkdir(dir, dentry); @@ -106,10 +104,10 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, } static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode, - void *data, const struct file_operations *fops) + void *data) { mode = (mode & S_IALLUGO) | S_IFLNK; - return debugfs_mknod(dir, dentry, mode, 0, data, fops); + return debugfs_mknod(dir, dentry, mode, 0, data, NULL); } static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -293,13 +291,19 @@ static struct file_system_type debug_fs_type = { .kill_sb = kill_litter_super, }; -static int debugfs_create_by_name(const char *name, umode_t mode, - struct dentry *parent, - struct dentry **dentry, - void *data, - const struct file_operations *fops) +struct dentry *__create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) { - int error = 0; + struct dentry *dentry = NULL; + int error; + + pr_debug("debugfs: creating file '%s'\n",name); + + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, + &debugfs_mount_count); + if (error) + goto exit; /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super @@ -309,30 +313,35 @@ static int debugfs_create_by_name(const char *name, umode_t mode, if (!parent) parent = debugfs_mount->mnt_root; - *dentry = NULL; + dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); - *dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(*dentry)) { + dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(dentry)) { switch (mode & S_IFMT) { case S_IFDIR: - error = debugfs_mkdir(parent->d_inode, *dentry, mode, - data, fops); + error = debugfs_mkdir(parent->d_inode, dentry, mode); + break; case S_IFLNK: - error = debugfs_link(parent->d_inode, *dentry, mode, - data, fops); + error = debugfs_link(parent->d_inode, dentry, mode, + data); break; default: - error = debugfs_create(parent->d_inode, *dentry, mode, + error = debugfs_create(parent->d_inode, dentry, mode, data, fops); break; } - dput(*dentry); + dput(dentry); } else - error = PTR_ERR(*dentry); + error = PTR_ERR(dentry); mutex_unlock(&parent->d_inode->i_mutex); - return error; + if (error) { + dentry = NULL; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + } +exit: + return dentry; } /** @@ -365,25 +374,15 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - struct dentry *dentry = NULL; - int error; - - pr_debug("debugfs: creating file '%s'\n",name); - - error = simple_pin_fs(&debug_fs_type, &debugfs_mount, - &debugfs_mount_count); - if (error) - goto exit; - - error = debugfs_create_by_name(name, mode, parent, &dentry, - data, fops); - if (error) { - dentry = NULL; - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - goto exit; + switch (mode & S_IFMT) { + case S_IFREG: + case 0: + break; + default: + BUG(); } -exit: - return dentry; + + return __create_file(name, mode, parent, data, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file); @@ -407,8 +406,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { - return debugfs_create_file(name, - S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(debugfs_create_dir); @@ -446,8 +444,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, if (!link) return NULL; - result = debugfs_create_file(name, S_IFLNK | S_IRWXUGO, parent, link, - NULL); + result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL); if (!result) kfree(link); return result; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 979c1e309c7..14afbabe654 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -439,15 +439,15 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, return ERR_PTR(error); if (opts.newinstance) - s = sget(fs_type, NULL, set_anon_super, NULL); + s = sget(fs_type, NULL, set_anon_super, flags, NULL); else - s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); + s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, + NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { - s->s_flags = flags; error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) goto out_undo_sget; diff --git a/fs/direct-io.c b/fs/direct-io.c index 0c85fae3766..1faf4cb56f3 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1258,7 +1258,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(retval == -EIOCBQUEUED); if (dio->is_async && retval == 0 && dio->result && - ((rw & READ) || (dio->result == sdio.size))) + ((rw == READ) || (dio->result == sdio.size))) retval = -EIOCBQUEUED; if (retval != -EIOCBQUEUED) diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 534c1d46e69..1b5d9af937d 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -32,7 +32,7 @@ /** * ecryptfs_d_revalidate - revalidate an ecryptfs dentry * @dentry: The ecryptfs dentry - * @nd: The associated nameidata + * @flags: lookup flags * * Called when the VFS needs to revalidate a dentry. This * is called whenever a name lookup finds a dentry in the @@ -42,32 +42,20 @@ * Returns 1 if valid, 0 otherwise. * */ -static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry; struct vfsmount *lower_mnt; - struct dentry *dentry_save = NULL; - struct vfsmount *vfsmount_save = NULL; int rc = 1; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; - if (nd) { - dentry_save = nd->path.dentry; - vfsmount_save = nd->path.mnt; - nd->path.dentry = lower_dentry; - nd->path.mnt = lower_mnt; - } - rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); - if (nd) { - nd->path.dentry = dentry_save; - nd->path.mnt = vfsmount_save; - } + rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); if (dentry->d_inode) { struct inode *lower_inode = ecryptfs_inode_to_lower(dentry->d_inode); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 867b64c5d84..989e034f02b 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -550,20 +550,6 @@ extern struct kmem_cache *ecryptfs_key_record_cache; extern struct kmem_cache *ecryptfs_key_sig_cache; extern struct kmem_cache *ecryptfs_global_auth_tok_cache; extern struct kmem_cache *ecryptfs_key_tfm_cache; -extern struct kmem_cache *ecryptfs_open_req_cache; - -struct ecryptfs_open_req { -#define ECRYPTFS_REQ_PROCESSED 0x00000001 -#define ECRYPTFS_REQ_DROPPED 0x00000002 -#define ECRYPTFS_REQ_ZOMBIE 0x00000004 - u32 flags; - struct file **lower_file; - struct dentry *lower_dentry; - struct vfsmount *lower_mnt; - wait_queue_head_t wait; - struct mutex mux; - struct list_head kthread_ctl_list; -}; struct inode *ecryptfs_get_inode(struct inode *lower_inode, struct super_block *sb); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a07441a0a87..ffa2be57804 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -173,7 +173,7 @@ ecryptfs_do_create(struct inode *directory_inode, inode = ERR_CAST(lower_dir_dentry); goto out; } - rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, NULL); + rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -240,7 +240,6 @@ out: * @dir: The inode of the directory in which to create the file. * @dentry: The eCryptfs dentry * @mode: The mode of the new file. - * @nd: nameidata * * Creates a new file. * @@ -248,7 +247,7 @@ out: */ static int ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { struct inode *ecryptfs_inode; int rc; @@ -270,8 +269,8 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, iput(ecryptfs_inode); goto out; } - d_instantiate(ecryptfs_dentry, ecryptfs_inode); unlock_new_inode(ecryptfs_inode); + d_instantiate(ecryptfs_dentry, ecryptfs_inode); out: return rc; } @@ -374,7 +373,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, */ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, struct dentry *ecryptfs_dentry, - struct nameidata *ecryptfs_nd) + unsigned int flags) { char *encrypted_and_encoded_name = NULL; size_t encrypted_and_encoded_name_size; diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 0dbe58a8b17..809e67d05ca 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -27,7 +27,12 @@ #include <linux/mount.h> #include "ecryptfs_kernel.h" -struct kmem_cache *ecryptfs_open_req_cache; +struct ecryptfs_open_req { + struct file **lower_file; + struct path path; + struct completion done; + struct list_head kthread_ctl_list; +}; static struct ecryptfs_kthread_ctl { #define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001 @@ -67,18 +72,10 @@ static int ecryptfs_threadfn(void *ignored) req = list_first_entry(&ecryptfs_kthread_ctl.req_list, struct ecryptfs_open_req, kthread_ctl_list); - mutex_lock(&req->mux); list_del(&req->kthread_ctl_list); - if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) { - dget(req->lower_dentry); - mntget(req->lower_mnt); - (*req->lower_file) = dentry_open( - req->lower_dentry, req->lower_mnt, - (O_RDWR | O_LARGEFILE), current_cred()); - req->flags |= ECRYPTFS_REQ_PROCESSED; - } - wake_up(&req->wait); - mutex_unlock(&req->mux); + *req->lower_file = dentry_open(&req->path, + (O_RDWR | O_LARGEFILE), current_cred()); + complete(&req->done); } mutex_unlock(&ecryptfs_kthread_ctl.mux); } @@ -111,10 +108,9 @@ void ecryptfs_destroy_kthread(void) ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, kthread_ctl_list) { - mutex_lock(&req->mux); - req->flags |= ECRYPTFS_REQ_ZOMBIE; - wake_up(&req->wait); - mutex_unlock(&req->mux); + list_del(&req->kthread_ctl_list); + *req->lower_file = ERR_PTR(-EIO); + complete(&req->done); } mutex_unlock(&ecryptfs_kthread_ctl.mux); kthread_stop(ecryptfs_kthread); @@ -136,34 +132,26 @@ int ecryptfs_privileged_open(struct file **lower_file, struct vfsmount *lower_mnt, const struct cred *cred) { - struct ecryptfs_open_req *req; + struct ecryptfs_open_req req; int flags = O_LARGEFILE; int rc = 0; + init_completion(&req.done); + req.lower_file = lower_file; + req.path.dentry = lower_dentry; + req.path.mnt = lower_mnt; + /* Corresponding dput() and mntput() are done when the * lower file is fput() when all eCryptfs files for the inode are * released. */ - dget(lower_dentry); - mntget(lower_mnt); flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR; - (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred); + (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; } - req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); - if (!req) { - rc = -ENOMEM; - goto out; - } - mutex_init(&req->mux); - req->lower_file = lower_file; - req->lower_dentry = lower_dentry; - req->lower_mnt = lower_mnt; - init_waitqueue_head(&req->wait); - req->flags = 0; mutex_lock(&ecryptfs_kthread_ctl.mux); if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { rc = -EIO; @@ -171,27 +159,14 @@ int ecryptfs_privileged_open(struct file **lower_file, printk(KERN_ERR "%s: We are in the middle of shutting down; " "aborting privileged request to open lower file\n", __func__); - goto out_free; + goto out; } - list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); + list_add_tail(&req.kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); - wait_event(req->wait, (req->flags != 0)); - mutex_lock(&req->mux); - BUG_ON(req->flags == 0); - if (req->flags & ECRYPTFS_REQ_DROPPED - || req->flags & ECRYPTFS_REQ_ZOMBIE) { - rc = -EIO; - printk(KERN_WARNING "%s: Privileged open request dropped\n", - __func__); - goto out_unlock; - } - if (IS_ERR(*req->lower_file)) - rc = PTR_ERR(*req->lower_file); -out_unlock: - mutex_unlock(&req->mux); -out_free: - kmem_cache_free(ecryptfs_open_req_cache, req); + wait_for_completion(&req.done); + if (IS_ERR(*lower_file)) + rc = PTR_ERR(*lower_file); out: return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 68954937a07..1c0b3b6b75c 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -499,13 +499,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - s = sget(fs_type, NULL, set_anon_super, NULL); + s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { rc = PTR_ERR(s); goto out; } - s->s_flags = flags; rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); if (rc) goto out1; @@ -682,11 +681,6 @@ static struct ecryptfs_cache_info { .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, - { - .cache = &ecryptfs_open_req_cache, - .name = "ecryptfs_open_req_cache", - .size = sizeof(struct ecryptfs_open_req), - }, }; static void ecryptfs_free_kmem_caches(void) diff --git a/fs/efs/efs.h b/fs/efs/efs.h index d8305b582ab..5528926ac7f 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -129,7 +129,7 @@ extern struct inode *efs_iget(struct super_block *, unsigned long); extern efs_block_t efs_map_block(struct inode *, efs_block_t); extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *efs_lookup(struct inode *, struct dentry *, unsigned int); extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 832b10ded82..96f66d213a1 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -58,7 +58,8 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) return(0); } -struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { +struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ efs_ino_t inodenum; struct inode *inode = NULL; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 74598f67efe..1c8b5567080 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1710,7 +1710,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP)) + if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epds.events &= ~EPOLLWAKEUP; /* diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index fc7161d6bf6..4731fd991ef 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -46,7 +46,7 @@ static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) } static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode; ino_t ino; @@ -60,7 +60,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, } static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode = exofs_new_inode(dir, mode); int err = PTR_ERR(inode); diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 49cf230554a..24a49d47e93 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -735,13 +735,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) out: ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; - if (unlikely(ret)) { - if (length == ios->length) - return ret; - else - ios->length -= length; - } - return 0; + return ret; } int ore_create(struct ore_io_state *ios) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index d222c77cfa1..5f376d14fdc 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -144,26 +144,26 @@ static void _sp2d_reset(struct __stripe_pages_2d *sp2d, { unsigned data_devs = sp2d->data_devs; unsigned group_width = data_devs + sp2d->parity; - unsigned p; + int p, c; if (!sp2d->needed) return; - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->write_count < group_width) { - unsigned c; + for (c = data_devs - 1; c >= 0; --c) + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - for (c = 0; c < data_devs; c++) - if (_1ps->page_is_read[c]) { - struct page *page = _1ps->pages[c]; + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; - r4w->put_page(priv, page); - _1ps->page_is_read[c] = false; - } + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } } + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); _1ps->write_count = 0; _1ps->tx = NULL; @@ -461,16 +461,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) * ios->sp2d[p][*], xor is calculated the same way. These pages are * allocated/freed and don't go through cache */ -static int _read_4_write(struct ore_io_state *ios) +static int _read_4_write_first_stripe(struct ore_io_state *ios) { - struct ore_io_state *ios_read; struct ore_striping_info read_si; struct __stripe_pages_2d *sp2d = ios->sp2d; u64 offset = ios->si.first_stripe_start; - u64 last_stripe_end; - unsigned bytes_in_stripe = ios->si.bytes_in_stripe; - unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; - int ret; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; if (offset == ios->offset) /* Go to start collect $200 */ goto read_last_stripe; @@ -478,6 +474,9 @@ static int _read_4_write(struct ore_io_state *ios) min_p = _sp2d_min_pg(sp2d); max_p = _sp2d_max_pg(sp2d); + ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", + offset, ios->offset, min_p, max_p); + for (c = 0; ; c++) { ore_calc_stripe_info(ios->layout, offset, 0, &read_si); read_si.obj_offset += min_p * PAGE_SIZE; @@ -512,6 +511,18 @@ static int _read_4_write(struct ore_io_state *ios) } read_last_stripe: + return 0; +} + +static int _read_4_write_last_stripe(struct ore_io_state *ios) +{ + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + offset = ios->offset + ios->length; if (offset % PAGE_SIZE) _add_to_r4w_last_page(ios, &offset); @@ -527,15 +538,15 @@ read_last_stripe: c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); - BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); - /* unaligned IO must be within a single stripe */ - if (min_p == sp2d->pages_in_unit) { /* Didn't do it yet */ min_p = _sp2d_min_pg(sp2d); max_p = _sp2d_max_pg(sp2d); } + ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", + offset, last_stripe_end, min_p, max_p); + while (offset < last_stripe_end) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; @@ -568,6 +579,15 @@ read_last_stripe: } read_it: + return 0; +} + +static int _read_4_write_execute(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + unsigned i; + int ret; + ios_read = ios->ios_read_4_write; if (!ios_read) return 0; @@ -591,6 +611,8 @@ read_it: } _mark_read4write_pages_uptodate(ios_read, ret); + ore_put_io_state(ios_read); + ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ return 0; } @@ -626,8 +648,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, /* If first stripe, Read in all read4write pages * (if needed) before we calculate the first parity. */ - _read_4_write(ios); + _read_4_write_first_stripe(ios); } + if (!cur_len) /* If last stripe r4w pages of last stripe */ + _read_4_write_last_stripe(ios); + _read_4_write_execute(ios); for (i = 0; i < num_pages; i++) { pages[i] = _raid_page_alloc(); @@ -654,34 +679,14 @@ int _ore_add_parity_unit(struct ore_io_state *ios, int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) { - struct ore_layout *layout = ios->layout; - if (ios->parity_pages) { + struct ore_layout *layout = ios->layout; unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; - unsigned stripe_size = ios->si.bytes_in_stripe; - u64 last_stripe, first_stripe; if (_sp2d_alloc(pages_in_unit, layout->group_width, layout->parity, &ios->sp2d)) { return -ENOMEM; } - - /* Round io down to last full strip */ - first_stripe = div_u64(ios->offset, stripe_size); - last_stripe = div_u64(ios->offset + ios->length, stripe_size); - - /* If an IO spans more then a single stripe it must end at - * a stripe boundary. The reminder at the end is pushed into the - * next IO. - */ - if (last_stripe != first_stripe) { - ios->length = last_stripe * stripe_size - ios->offset; - - BUG_ON(!ios->length); - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / - PAGE_SIZE; - ios->si.length = ios->length; /*make it consistent */ - } } return 0; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index b0201ca6e9c..29ab099e3e0 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -19,19 +19,19 @@ #define dprintk(fmt, args...) do{}while(0) -static int get_name(struct vfsmount *mnt, struct dentry *dentry, char *name, - struct dentry *child); +static int get_name(const struct path *path, char *name, struct dentry *child); static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir, char *name, struct dentry *child) { const struct export_operations *nop = dir->d_sb->s_export_op; + struct path path = {.mnt = mnt, .dentry = dir}; if (nop->get_name) return nop->get_name(dir, name, child); else - return get_name(mnt, dir, name, child); + return get_name(&path, name, child); } /* @@ -44,13 +44,14 @@ find_acceptable_alias(struct dentry *result, { struct dentry *dentry, *toput = NULL; struct inode *inode; + struct hlist_node *p; if (acceptable(context, result)) return result; inode = result->d_inode; spin_lock(&inode->i_lock); - list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) @@ -248,11 +249,10 @@ static int filldir_one(void * __buf, const char * name, int len, * calls readdir on the parent until it finds an entry with * the same inode number as the child, and returns that. */ -static int get_name(struct vfsmount *mnt, struct dentry *dentry, - char *name, struct dentry *child) +static int get_name(const struct path *path, char *name, struct dentry *child) { const struct cred *cred = current_cred(); - struct inode *dir = dentry->d_inode; + struct inode *dir = path->dentry->d_inode; int error; struct file *file; struct getdents_callback buffer; @@ -266,7 +266,7 @@ static int get_name(struct vfsmount *mnt, struct dentry *dentry, /* * Open the directory ... */ - file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY, cred); + file = dentry_open(path, O_RDONLY, cred); error = PTR_ERR(file); if (IS_ERR(file)) goto out; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index f663a67d7bf..73b0d951983 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -41,8 +41,8 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ext2_add_link(dentry, inode); if (!err) { - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -55,7 +55,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) * Methods themselves. */ -static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode; ino_t ino; @@ -94,7 +94,7 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) +static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; @@ -242,8 +242,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); out: return err; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b3621cb7ea3..9f311d27b16 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -771,13 +771,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - goto failed_unlock; + goto failed; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) { kfree(sbi); - goto failed_unlock; + goto failed; } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; @@ -1130,7 +1130,7 @@ failed_sbi: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); -failed_unlock: +failed: return ret; } @@ -1184,6 +1184,12 @@ static int ext2_sync_fs(struct super_block *sb, int wait) struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = EXT2_SB(sb)->s_es; + /* + * Write quota structures to quota file, sync_blockdev() will write + * them to disk later + */ + dquot_writeback_dquots(sb, -1); + spin_lock(&sbi->s_lock); if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { ext2_debug("setting valid to 0\n"); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 92490e9f85c..c8fff930790 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -300,10 +300,11 @@ loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext3_get_htree_eof(file); if (likely(dx_dir)) return generic_file_llseek_size(file, offset, origin, - ext3_get_htree_eof(file)); + htree_max, htree_max); else return generic_file_llseek(file, offset, origin); } diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d4dff278cbd..b31dbd4c46a 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -92,8 +92,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * disk caches manually so that data really is on persistent * storage */ - if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (needs_barrier) { + int err; + + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } out: trace_ext3_sync_file_exit(inode, ret); return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index eeb63dfc5d2..8f4fddac01a 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1011,7 +1011,7 @@ errout: return NULL; } -static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode; struct ext3_dir_entry_2 * de; @@ -1671,8 +1671,8 @@ static int ext3_add_nondir(handle_t *handle, int err = ext3_add_entry(handle, dentry, inode); if (!err) { ext3_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; } drop_nlink(inode); @@ -1690,7 +1690,7 @@ static int ext3_add_nondir(handle_t *handle, * with d_instantiate(). */ static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, - struct nameidata *nd) + bool excl) { handle_t *handle; struct inode * inode; @@ -1836,8 +1836,8 @@ out_clear_inode: if (err) goto out_clear_inode; - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); out_stop: brelse(dir_block); ext3_journal_stop(handle); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 8c3a44b7c37..ff9bcdc5b0d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2058,7 +2058,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount3; } - ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); @@ -2526,6 +2527,11 @@ static int ext3_sync_fs(struct super_block *sb, int wait) tid_t target; trace_ext3_sync_fs(sb, wait); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index aa39e600d15..8e07d2a5a13 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -324,74 +324,27 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) /* - * ext4_dir_llseek() based on generic_file_llseek() to handle both - * non-htree and htree directories, where the "offset" is in terms - * of the filename hash value instead of the byte offset. + * ext4_dir_llseek() calls generic_file_llseek_size to handle htree + * directories, where the "offset" is in terms of the filename hash + * value instead of the byte offset. * - * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX) - * will be invalid once the directory was converted into a dx directory + * Because we may return a 64-bit hash that is well beyond offset limits, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * For non-htree, ext4_llseek already chooses the proper max offset. */ loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) { struct inode *inode = file->f_mapping->host; - loff_t ret = -EINVAL; int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext4_get_htree_eof(file); - mutex_lock(&inode->i_mutex); - - /* NOTE: relative offsets with dx directories might not work - * as expected, as it is difficult to figure out the - * correct offset between dx hashes */ - - switch (origin) { - case SEEK_END: - if (unlikely(offset > 0)) - goto out_err; /* not supported for directories */ - - /* so only negative offsets are left, does that have a - * meaning for directories at all? */ - if (dx_dir) - offset += ext4_get_htree_eof(file); - else - offset += inode->i_size; - break; - case SEEK_CUR: - /* - * Here we special-case the lseek(fd, 0, SEEK_CUR) - * position-querying operation. Avoid rewriting the "same" - * f_pos value back to the file because a concurrent read(), - * write() or lseek() might have altered it - */ - if (offset == 0) { - offset = file->f_pos; - goto out_ok; - } - - offset += file->f_pos; - break; - } - - if (unlikely(offset < 0)) - goto out_err; - - if (!dx_dir) { - if (offset > inode->i_sb->s_maxbytes) - goto out_err; - } else if (offset > ext4_get_htree_eof(file)) - goto out_err; - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - -out_ok: - ret = offset; -out_err: - mutex_unlock(&inode->i_mutex); - - return ret; + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, origin, + htree_max, htree_max); + else + return ext4_llseek(file, offset, origin); } /* diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8c7642a0005..782eecb57e4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -211,9 +211,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } /* - * ext4_llseek() copied from generic_file_llseek() to handle both - * block-mapped and extent-mapped maxbytes values. This should - * otherwise be identical with generic_file_llseek(). + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each. */ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) { @@ -225,7 +225,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - return generic_file_llseek_size(file, offset, origin, maxbytes); + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index bb6c7d81131..2a1dcea4f12 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -135,14 +135,7 @@ static int ext4_sync_parent(struct inode *inode) inode = igrab(inode); while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = NULL; - spin_lock(&inode->i_lock); - if (!list_empty(&inode->i_dentry)) { - dentry = list_first_entry(&inode->i_dentry, - struct dentry, d_alias); - dget(dentry); - } - spin_unlock(&inode->i_lock); + dentry = d_find_any_alias(inode); if (!dentry) break; next = igrab(dentry->d_parent->d_inode); @@ -232,7 +225,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!journal) { ret = __sync_inode(inode, datasync); - if (!ret && !list_empty(&inode->i_dentry)) + if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e34deac3f36..7f7dad78760 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -268,7 +268,6 @@ group_extend_out: err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); - mnt_drop_write(filp->f_path.mnt); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) @@ -390,7 +389,7 @@ group_add_out: if (err) return err; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) goto resizefs_out; @@ -402,7 +401,7 @@ group_add_out: } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); resizefs_out: ext4_resize_end(sb); return err; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5845cd97bf8..d0d3f0e87f9 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1312,7 +1312,7 @@ errout: return NULL; } -static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de; @@ -2072,8 +2072,8 @@ static int ext4_add_nondir(handle_t *handle, int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; } drop_nlink(inode); @@ -2091,7 +2091,7 @@ static int ext4_add_nondir(handle_t *handle, * with d_instantiate(). */ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { handle_t *handle; struct inode *inode; @@ -2249,8 +2249,8 @@ out_clear_inode: err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); out_stop: brelse(dir_block); ext4_journal_stop(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb7aa3e4ef0..d8759401eca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4325,6 +4325,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->dio_unwritten_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) jbd2_log_wait_commit(sbi->s_journal, target); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index c5938c9084b..70d993a9380 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -201,7 +201,7 @@ static const struct dentry_operations msdos_dentry_operations = { /***** Get inode using directory and name */ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -265,7 +265,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, /***** Create a file */ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode = NULL; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 98ae804f527..6cc48065243 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -41,9 +41,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry) return ret; } -static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) +static int vfat_revalidate(struct dentry *dentry, unsigned int flags) { - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; /* This is not negative dentry. Always valid. */ @@ -52,9 +52,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) return vfat_revalidate_shortname(dentry); } -static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) +static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) { - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; /* @@ -74,7 +74,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) * This may be nfsd (or something), anyway, we can't see the * intent of this. So, since this can be for creation, drop it. */ - if (!nd) + if (!flags) return 0; /* @@ -82,7 +82,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) * case sensitive name which is specified by user if this is * for creation. */ - if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; return vfat_revalidate_shortname(dentry); @@ -714,7 +714,7 @@ static int vfat_d_anon_disconn(struct dentry *dentry) } static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -772,7 +772,7 @@ error: } static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; diff --git a/fs/fifo.c b/fs/fifo.c index b1a524d798e..cf6f4345ceb 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -14,7 +14,7 @@ #include <linux/sched.h> #include <linux/pipe_fs_i.h> -static void wait_for_partner(struct inode* inode, unsigned int *cnt) +static int wait_for_partner(struct inode* inode, unsigned int *cnt) { int cur = *cnt; @@ -23,6 +23,7 @@ static void wait_for_partner(struct inode* inode, unsigned int *cnt) if (signal_pending(current)) break; } + return cur == *cnt ? -ERESTARTSYS : 0; } static void wake_up_partner(struct inode* inode) @@ -67,8 +68,7 @@ static int fifo_open(struct inode *inode, struct file *filp) * seen a writer */ filp->f_version = pipe->w_counter; } else { - wait_for_partner(inode, &pipe->w_counter); - if(signal_pending(current)) + if (wait_for_partner(inode, &pipe->w_counter)) goto err_rd; } } @@ -90,8 +90,7 @@ static int fifo_open(struct inode *inode, struct file *filp) wake_up_partner(inode); if (!pipe->readers) { - wait_for_partner(inode, &pipe->r_counter); - if (signal_pending(current)) + if (wait_for_partner(inode, &pipe->r_counter)) goto err_wr; } break; diff --git a/fs/file_table.c b/fs/file_table.c index a305d9e2d1b..b3fc4d67a26 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -23,6 +23,8 @@ #include <linux/lglock.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/task_work.h> #include <linux/ima.h> #include <linux/atomic.h> @@ -251,7 +253,6 @@ static void __fput(struct file *file) } fops_put(file->f_op); put_pid(file->f_owner.pid); - file_sb_list_del(file); if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); if (file->f_mode & FMODE_WRITE) @@ -263,10 +264,77 @@ static void __fput(struct file *file) mntput(mnt); } +static DEFINE_SPINLOCK(delayed_fput_lock); +static LIST_HEAD(delayed_fput_list); +static void delayed_fput(struct work_struct *unused) +{ + LIST_HEAD(head); + spin_lock_irq(&delayed_fput_lock); + list_splice_init(&delayed_fput_list, &head); + spin_unlock_irq(&delayed_fput_lock); + while (!list_empty(&head)) { + struct file *f = list_first_entry(&head, struct file, f_u.fu_list); + list_del_init(&f->f_u.fu_list); + __fput(f); + } +} + +static void ____fput(struct callback_head *work) +{ + __fput(container_of(work, struct file, f_u.fu_rcuhead)); +} + +/* + * If kernel thread really needs to have the final fput() it has done + * to complete, call this. The only user right now is the boot - we + * *do* need to make sure our writes to binaries on initramfs has + * not left us with opened struct file waiting for __fput() - execve() + * won't work without that. Please, don't add more callers without + * very good reasons; in particular, never call that with locks + * held and never call that from a thread that might need to do + * some work on any kind of umount. + */ +void flush_delayed_fput(void) +{ + delayed_fput(NULL); +} + +static DECLARE_WORK(delayed_fput_work, delayed_fput); + void fput(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) + if (atomic_long_dec_and_test(&file->f_count)) { + struct task_struct *task = current; + file_sb_list_del(file); + if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) { + unsigned long flags; + spin_lock_irqsave(&delayed_fput_lock, flags); + list_add(&file->f_u.fu_list, &delayed_fput_list); + schedule_work(&delayed_fput_work); + spin_unlock_irqrestore(&delayed_fput_lock, flags); + return; + } + init_task_work(&file->f_u.fu_rcuhead, ____fput); + task_work_add(task, &file->f_u.fu_rcuhead, true); + } +} + +/* + * synchronous analog of fput(); for kernel threads that might be needed + * in some umount() (and thus can't use flush_delayed_fput() without + * risking deadlocks), need to wait for completion of __fput() and know + * for this specific struct file it won't involve anything that would + * need them. Use only if you really need it - at the very least, + * don't blindly convert fput() by kernel thread to that. + */ +void __fput_sync(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) { + struct task_struct *task = current; + file_sb_list_del(file); + BUG_ON(!(task->flags & PF_KTHREAD)); __fput(file); + } } EXPORT_SYMBOL(fput); @@ -483,10 +551,8 @@ void mark_files_ro(struct super_block *sb) { struct file *f; -retry: lg_global_lock(&files_lglock); do_file_list_for_each_entry(sb, f) { - struct vfsmount *mnt; if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) continue; if (!file_count(f)) @@ -499,12 +565,7 @@ retry: if (file_check_writeable(f) != 0) continue; file_release_write(f); - mnt = mntget(f->f_path.mnt); - /* This can sleep, so we can't hold the spinlock. */ - lg_global_unlock(&files_lglock); - mnt_drop_write(mnt); - mntput(mnt); - goto retry; + mnt_drop_write_file(f); } while_file_list_for_each_entry; lg_global_unlock(&files_lglock); } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 3360f1e678a..bd447e88f20 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -48,7 +48,7 @@ #define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize)) -static struct dentry * vxfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); static int vxfs_readdir(struct file *, void *, filldir_t); const struct inode_operations vxfs_dir_inode_ops = { @@ -203,7 +203,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * in the return pointer. */ static struct dentry * -vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd) +vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) { struct inode *ip = NULL; ino_t ino; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 41a3ccff18d..8f660dd6137 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1315,6 +1315,8 @@ void writeback_inodes_sb_nr(struct super_block *sb, .reason = reason, }; + if (sb->s_bdi == &noop_backing_dev_info) + return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_queue_work(sb->s_bdi, &work); wait_for_completion(&done); @@ -1398,6 +1400,9 @@ void sync_inodes_sb(struct super_block *sb) .reason = WB_REASON_SYNC, }; + /* Nothing to do? */ + if (sb->s_bdi == &noop_backing_dev_info) + return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_queue_work(sb->s_bdi, &work); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index e159e682ad4..5df4775fea0 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -6,18 +6,6 @@ #include <linux/fs_struct.h> #include "internal.h" -static inline void path_get_longterm(struct path *path) -{ - path_get(path); - mnt_make_longterm(path->mnt); -} - -static inline void path_put_longterm(struct path *path) -{ - mnt_make_shortterm(path->mnt); - path_put(path); -} - /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. @@ -26,7 +14,7 @@ void set_fs_root(struct fs_struct *fs, struct path *path) { struct path old_root; - path_get_longterm(path); + path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_root = fs->root; @@ -34,7 +22,7 @@ void set_fs_root(struct fs_struct *fs, struct path *path) write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) - path_put_longterm(&old_root); + path_put(&old_root); } /* @@ -45,7 +33,7 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path) { struct path old_pwd; - path_get_longterm(path); + path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; @@ -54,7 +42,7 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path) spin_unlock(&fs->lock); if (old_pwd.dentry) - path_put_longterm(&old_pwd); + path_put(&old_pwd); } static inline int replace_path(struct path *p, const struct path *old, const struct path *new) @@ -84,7 +72,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) write_seqcount_end(&fs->seq); while (hits--) { count++; - path_get_longterm(new_root); + path_get(new_root); } spin_unlock(&fs->lock); } @@ -92,13 +80,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) } while_each_thread(g, p); read_unlock(&tasklist_lock); while (count--) - path_put_longterm(old_root); + path_put(old_root); } void free_fs_struct(struct fs_struct *fs) { - path_put_longterm(&fs->root); - path_put_longterm(&fs->pwd); + path_put(&fs->root); + path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } @@ -132,9 +120,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) spin_lock(&old->lock); fs->root = old->root; - path_get_longterm(&fs->root); + path_get(&fs->root); fs->pwd = old->pwd; - path_get_longterm(&fs->pwd); + path_get(&fs->pwd); spin_unlock(&old->lock); } return fs; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 334e0b18a01..8964cf3999b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -154,7 +154,7 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ -static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) +static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; @@ -174,7 +174,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (!inode) return 0; - if (nd && (nd->flags & LOOKUP_RCU)) + if (flags & LOOKUP_RCU) return -ECHILD; fc = get_fuse_conn(inode); @@ -249,7 +249,7 @@ static struct dentry *fuse_d_add_directory(struct dentry *entry, /* This tries to shrink the subtree below alias */ fuse_invalidate_entry(alias); dput(alias); - if (!list_empty(&inode->i_dentry)) + if (!hlist_empty(&inode->i_dentry)) return ERR_PTR(-EBUSY); } else { dput(alias); @@ -316,7 +316,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, } static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, - struct nameidata *nd) + unsigned int flags) { int err; struct fuse_entry_out outarg; @@ -370,7 +370,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, * 'mknod' + 'open' requests. */ static int fuse_create_open(struct inode *dir, struct dentry *entry, - umode_t mode, struct nameidata *nd) + struct file *file, unsigned flags, + umode_t mode, int *opened) { int err; struct inode *inode; @@ -381,15 +382,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_open_out outopen; struct fuse_entry_out outentry; struct fuse_file *ff; - struct file *file; - int flags = nd->intent.open.flags; - - if (fc->no_create) - return -ENOSYS; forget = fuse_alloc_forget(); + err = -ENOMEM; if (!forget) - return -ENOMEM; + goto out_err; req = fuse_get_req(fc); err = PTR_ERR(req); @@ -428,11 +425,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, req->out.args[1].value = &outopen; fuse_request_send(fc, req); err = req->out.h.error; - if (err) { - if (err == -ENOSYS) - fc->no_create = 1; + if (err) goto out_free_ff; - } err = -EIO; if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) @@ -448,28 +442,74 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(ff, flags); fuse_queue_forget(fc, forget, outentry.nodeid, 1); - return -ENOMEM; + err = -ENOMEM; + goto out_err; } kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_invalidate_attr(dir); - file = lookup_instantiate_filp(nd, entry, generic_file_open); - if (IS_ERR(file)) { + err = finish_open(file, entry, generic_file_open, opened); + if (err) { fuse_sync_release(ff, flags); - return PTR_ERR(file); + } else { + file->private_data = fuse_file_get(ff); + fuse_finish_open(inode, file); } - file->private_data = fuse_file_get(ff); - fuse_finish_open(inode, file); - return 0; + return err; - out_free_ff: +out_free_ff: fuse_file_free(ff); - out_put_request: +out_put_request: fuse_put_request(fc, req); - out_put_forget_req: +out_put_forget_req: kfree(forget); +out_err: + return err; +} + +static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); +static int fuse_atomic_open(struct inode *dir, struct dentry *entry, + struct file *file, unsigned flags, + umode_t mode, int *opened) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct dentry *res = NULL; + + if (d_unhashed(entry)) { + res = fuse_lookup(dir, entry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + entry = res; + } + + if (!(flags & O_CREAT) || entry->d_inode) + goto no_open; + + /* Only creates */ + *opened |= FILE_CREATED; + + if (fc->no_create) + goto mknod; + + err = fuse_create_open(dir, entry, file, flags, mode, opened); + if (err == -ENOSYS) { + fc->no_create = 1; + goto mknod; + } +out_dput: + dput(res); return err; + +mknod: + err = fuse_mknod(dir, entry, mode, 0); + if (err) + goto out_dput; +no_open: + return finish_no_open(file, res); } /* @@ -571,14 +611,8 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, } static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, - struct nameidata *nd) + bool excl) { - if (nd) { - int err = fuse_create_open(dir, entry, mode, nd); - if (err != -ENOSYS) - return err; - /* Fall back on mknod */ - } return fuse_mknod(dir, entry, mode, 0); } @@ -1646,6 +1680,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, + .atomic_open = fuse_atomic_open, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e80a464850c..d6526347d38 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -614,7 +614,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; - struct gfs2_qadata *qa = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); struct page *page; @@ -638,15 +637,9 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); if (alloc_required) { - qa = gfs2_qadata_get(ip); - if (!qa) { - error = -ENOMEM; - goto out_unlock; - } - error = gfs2_quota_lock_check(ip); if (error) - goto out_alloc_put; + goto out_unlock; error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (error) @@ -708,8 +701,6 @@ out_trans_fail: gfs2_inplace_release(ip); out_qunlock: gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_qadata_put(ip); } out_unlock: if (&ip->i_inode == sdp->sd_rindex) { @@ -846,7 +837,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; - struct gfs2_qadata *qa = ip->i_qadata; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; @@ -878,12 +868,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, brelse(dibh); failed: gfs2_trans_end(sdp); - if (ip->i_res) + if (gfs2_mb_reserved(ip)) gfs2_inplace_release(ip); - if (qa) { + if (ip->i_res->rs_qa_qd_num) gfs2_quota_unlock(ip); - gfs2_qadata_put(ip); - } if (inode == sdp->sd_rindex) { gfs2_glock_dq(&m_ip->i_gh); gfs2_holder_uninit(&m_ip->i_gh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index dab54099dd9..49cd7dd4a9f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) goto out_rlist; + if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ + gfs2_rs_deltree(ip->i_res); + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, revokes); @@ -1045,12 +1048,13 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; find_metapath(sdp, lblock, &mp, ip->i_height); - if (!gfs2_qadata_get(ip)) - return -ENOMEM; + error = gfs2_rindex_update(sdp); + if (error) + return error; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) - goto out; + return error; while (height--) { struct strip_mine sm; @@ -1064,8 +1068,6 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) gfs2_quota_unhold(ip); -out: - gfs2_qadata_put(ip); return error; } @@ -1167,19 +1169,14 @@ static int do_grow(struct inode *inode, u64 size) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; - struct gfs2_qadata *qa = NULL; int error; int unstuff = 0; if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { - qa = gfs2_qadata_get(ip); - if (qa == NULL) - return -ENOMEM; - error = gfs2_quota_lock_check(ip); if (error) - goto do_grow_alloc_put; + return error; error = gfs2_inplace_reserve(ip, 1); if (error) @@ -1214,8 +1211,6 @@ do_grow_release: gfs2_inplace_release(ip); do_grow_qunlock: gfs2_quota_unlock(ip); -do_grow_alloc_put: - gfs2_qadata_put(ip); } return error; } diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 0da8da2c991..4fddb3c22d2 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -25,7 +25,7 @@ /** * gfs2_drevalidate - Check directory lookup consistency * @dentry: the mapping to check - * @nd: + * @flags: lookup flags * * Check to make sure the lookup necessary to arrive at this inode from its * parent is still good. @@ -33,7 +33,7 @@ * Returns: 1 if the dentry is ok, 0 if it isn't */ -static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { struct dentry *parent; struct gfs2_sbd *sdp; @@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) int error; int had_lock = 0; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; parent = dget_parent(dentry); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 8aaeb07a07b..259b088cfc4 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1854,14 +1854,9 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (!ht) return -ENOMEM; - if (!gfs2_qadata_get(dip)) { - error = -ENOMEM; - goto out; - } - error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) - goto out_put; + goto out; /* Count the number of leaves */ bh = leaf_bh; @@ -1942,8 +1937,6 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); -out_put: - gfs2_qadata_put(dip); out: kfree(ht); return error; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 31b199f6efc..9aa6af13823 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -142,6 +142,7 @@ static const u32 fsflags_to_gfs2[32] = { [7] = GFS2_DIF_NOATIME, [12] = GFS2_DIF_EXHASH, [14] = GFS2_DIF_INHERIT_JDATA, + [17] = GFS2_DIF_TOPDIR, }; static const u32 gfs2_to_fsflags[32] = { @@ -150,6 +151,7 @@ static const u32 gfs2_to_fsflags[32] = { [gfs2fl_AppendOnly] = FS_APPEND_FL, [gfs2fl_NoAtime] = FS_NOATIME_FL, [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_TopLevel] = FS_TOPDIR_FL, [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, }; @@ -203,6 +205,7 @@ void gfs2_set_inode_flags(struct inode *inode) GFS2_DIF_NOATIME| \ GFS2_DIF_SYNC| \ GFS2_DIF_SYSTEM| \ + GFS2_DIF_TOPDIR| \ GFS2_DIF_INHERIT_JDATA) /** @@ -298,6 +301,7 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); if (!S_ISDIR(inode->i_mode)) { + gfsflags &= ~GFS2_DIF_TOPDIR; if (gfsflags & GFS2_DIF_INHERIT_JDATA) gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); return do_gfs2_set_flags(filp, gfsflags, ~0); @@ -366,7 +370,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; - struct gfs2_qadata *qa; loff_t size; int ret; @@ -376,6 +379,13 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + ret = gfs2_rs_alloc(ip); + if (ret) + return ret; + + atomic_set(&ip->i_res->rs_sizehint, + PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) @@ -393,14 +403,13 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_unlock; } - ret = -ENOMEM; - qa = gfs2_qadata_get(ip); - if (qa == NULL) + ret = gfs2_rindex_update(sdp); + if (ret) goto out_unlock; ret = gfs2_quota_lock_check(ip); if (ret) - goto out_alloc_put; + goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (ret) @@ -447,8 +456,6 @@ out_trans_fail: gfs2_inplace_release(ip); out_quota_unlock: gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_qadata_put(ip); out_unlock: gfs2_glock_dq(&gh); out: @@ -567,16 +574,14 @@ fail: static int gfs2_release(struct inode *inode, struct file *file) { - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_file *fp; + struct gfs2_inode *ip = GFS2_I(inode); - fp = file->private_data; + kfree(file->private_data); file->private_data = NULL; - if (gfs2_assert_warn(sdp, fp)) - return -EIO; - - kfree(fp); + if ((file->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + gfs2_rs_delete(ip); return 0; } @@ -653,12 +658,20 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + size_t writesize = iov_length(iov, nr_segs); + struct dentry *dentry = file->f_dentry; + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_sbd *sdp; + int ret; + sdp = GFS2_SB(file->f_mapping->host); + ret = gfs2_rs_alloc(ip); + if (ret) + return ret; + + atomic_set(&ip->i_res->rs_sizehint, writesize >> sdp->sd_sb.sb_bsize_shift); if (file->f_flags & O_APPEND) { - struct dentry *dentry = file->f_dentry; - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_holder gh; - int ret; ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) @@ -751,7 +764,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, struct gfs2_inode *ip = GFS2_I(inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes; - struct gfs2_qadata *qa; int error; const loff_t pos = offset; const loff_t count = len; @@ -774,11 +786,17 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (bytes == 0) bytes = sdp->sd_sb.sb_bsize; + error = gfs2_rs_alloc(ip); + if (error) + return error; + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) goto out_uninit; + atomic_set(&ip->i_res->rs_sizehint, len >> sdp->sd_sb.sb_bsize_shift); + while (len > 0) { if (len < bytes) bytes = len; @@ -787,15 +805,9 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, offset += bytes; continue; } - qa = gfs2_qadata_get(ip); - if (!qa) { - error = -ENOMEM; - goto out_unlock; - } - error = gfs2_quota_lock_check(ip); if (error) - goto out_alloc_put; + goto out_unlock; retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); @@ -835,7 +847,6 @@ retry: offset += max_bytes; gfs2_inplace_release(ip); gfs2_quota_unlock(ip); - gfs2_qadata_put(ip); } if (error == 0) @@ -846,8 +857,6 @@ out_trans_fail: gfs2_inplace_release(ip); out_qunlock: gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_qadata_put(ip); out_unlock: gfs2_glock_dq(&ip->i_gh); out_uninit: diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index dab2526071c..1ed81f40da0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -46,10 +46,11 @@ #include "trace_gfs2.h" struct gfs2_glock_iter { - int hash; /* hash bucket index */ - struct gfs2_sbd *sdp; /* incore superblock */ - struct gfs2_glock *gl; /* current glock struct */ - char string[512]; /* scratch space */ + int hash; /* hash bucket index */ + unsigned nhash; /* Index within current bucket */ + struct gfs2_sbd *sdp; /* incore superblock */ + struct gfs2_glock *gl; /* current glock struct */ + loff_t last_pos; /* last position */ }; typedef void (*glock_examiner) (struct gfs2_glock * gl); @@ -767,6 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + memset(gl->gl_lvb, 0, 32 * sizeof(char)); gl->gl_lksb.sb_lvbptr = gl->gl_lvb; gl->gl_tchange = jiffies; gl->gl_object = NULL; @@ -948,9 +950,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) va_start(args, fmt); if (seq) { - struct gfs2_glock_iter *gi = seq->private; - vsprintf(gi->string, fmt, args); - seq_printf(seq, gi->string); + seq_vprintf(seq, fmt, args); } else { vaf.fmt = fmt; vaf.va = &args; @@ -1854,8 +1854,14 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) gl = gi->gl; if (gl) { gi->gl = glock_hash_next(gl); + gi->nhash++; } else { + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } gi->gl = glock_hash_chain(gi->hash); + gi->nhash = 0; } while (gi->gl == NULL) { gi->hash++; @@ -1864,6 +1870,7 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) return 1; } gi->gl = glock_hash_chain(gi->hash); + gi->nhash = 0; } /* Skip entries for other sb and dead entries */ } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); @@ -1876,7 +1883,12 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - gi->hash = 0; + if (gi->last_pos <= *pos) + n = gi->nhash + (*pos - gi->last_pos); + else + gi->hash = 0; + + gi->nhash = 0; rcu_read_lock(); do { @@ -1884,6 +1896,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) return NULL; } while (n--); + gi->last_pos = *pos; return gi->gl; } @@ -1893,7 +1906,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, struct gfs2_glock_iter *gi = seq->private; (*pos)++; - + gi->last_pos = *pos; if (gfs2_glock_iter_next(gi)) return NULL; @@ -1964,6 +1977,8 @@ static const struct seq_operations gfs2_sbstats_seq_ops = { .show = gfs2_sbstats_seq_show, }; +#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) + static int gfs2_glocks_open(struct inode *inode, struct file *file) { int ret = seq_open_private(file, &gfs2_glock_seq_ops, @@ -1972,6 +1987,9 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; + seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); + if (seq->buf) + seq->size = GFS2_SEQ_GOODSIZE; } return ret; } @@ -1984,6 +2002,9 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; + seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); + if (seq->buf) + seq->size = GFS2_SEQ_GOODSIZE; } return ret; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 67fd6beffec..aaecc8085fc 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -84,17 +84,22 @@ struct gfs2_rgrpd { u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ u32 rd_free; + u32 rd_reserved; /* number of blocks reserved */ u32 rd_free_clone; u32 rd_dinodes; u64 rd_igeneration; struct gfs2_bitmap *rd_bits; struct gfs2_sbd *rd_sbd; + struct gfs2_rgrp_lvb *rd_rgl; u32 rd_last_alloc; u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ + spinlock_t rd_rsspin; /* protects reservation related vars */ + struct rb_root rd_rstree; /* multi-block reservation tree */ + u32 rd_rs_cnt; /* count of current reservations */ }; enum gfs2_state_bits { @@ -232,6 +237,38 @@ struct gfs2_holder { unsigned long gh_ip; }; +/* Resource group multi-block reservation, in order of appearance: + + Step 1. Function prepares to write, allocates a mb, sets the size hint. + Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info + Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use + Step 4. Bits are assigned from the rgrp based on either the reservation + or wherever it can. +*/ + +struct gfs2_blkreserv { + /* components used during write (step 1): */ + atomic_t rs_sizehint; /* hint of the write size */ + + /* components used during inplace_reserve (step 2): */ + u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */ + + /* components used during get_local_rgrp (step 3): */ + struct gfs2_rgrpd *rs_rgd; /* pointer to the gfs2_rgrpd */ + struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */ + struct rb_node rs_node; /* link to other block reservations */ + + /* components used during block searches and assignments (step 4): */ + struct gfs2_bitmap *rs_bi; /* bitmap for the current allocation */ + u32 rs_biblk; /* start block relative to the bi */ + u32 rs_free; /* how many blocks are still free */ + + /* ancillary quota stuff */ + struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; + struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; + unsigned int rs_qa_qd_num; +}; + enum { GLF_LOCK = 1, GLF_DEMOTE = 3, @@ -289,18 +326,6 @@ struct gfs2_glock { #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ -struct gfs2_qadata { /* quota allocation data */ - /* Quota stuff */ - struct gfs2_quota_data *qa_qd[2*MAXQUOTAS]; - struct gfs2_holder qa_qd_ghs[2*MAXQUOTAS]; - unsigned int qa_qd_num; -}; - -struct gfs2_blkreserv { - u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */ - struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */ -}; - enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, @@ -308,7 +333,6 @@ enum { GIF_SW_PAGED = 3, }; - struct gfs2_inode { struct inode i_inode; u64 i_no_addr; @@ -319,8 +343,7 @@ struct gfs2_inode { struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ - struct gfs2_qadata *i_qadata; /* quota allocation data */ - struct gfs2_blkreserv *i_res; /* resource group block reservation */ + struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */ struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; @@ -473,6 +496,7 @@ struct gfs2_args { unsigned int ar_discard:1; /* discard requests */ unsigned int ar_errors:2; /* errors=withdraw | panic */ unsigned int ar_nobarrier:1; /* do not send barriers */ + unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ int ar_commit; /* Commit interval */ int ar_statfs_quantum; /* The fast statfs interval */ int ar_quota_quantum; /* The quota interval */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a9ba2444e07..4ce22e54730 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -521,12 +521,13 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, int error; munge_mode_uid_gid(dip, &mode, &uid, &gid); - if (!gfs2_qadata_get(dip)) - return -ENOMEM; + error = gfs2_rindex_update(sdp); + if (error) + return error; error = gfs2_quota_lock(dip, uid, gid); if (error) - goto out; + return error; error = gfs2_quota_check(dip, uid, gid); if (error) @@ -542,8 +543,6 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, out_quota: gfs2_quota_unlock(dip); -out: - gfs2_qadata_put(dip); return error; } @@ -551,14 +550,13 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_qadata *qa; int alloc_required; struct buffer_head *dibh; int error; - qa = gfs2_qadata_get(dip); - if (!qa) - return -ENOMEM; + error = gfs2_rindex_update(sdp); + if (error) + return error; error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) @@ -605,13 +603,13 @@ fail_end_trans: gfs2_trans_end(sdp); fail_ipreserv: - gfs2_inplace_release(dip); + if (alloc_required) + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); fail: - gfs2_qadata_put(dip); return error; } @@ -657,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, const struct qstr *name = &dentry->d_name; struct gfs2_holder ghs[2]; struct inode *inode = NULL; - struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; int error; @@ -667,6 +665,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; + /* We need a reservation to allocate the new dinode block. The + directory ip temporarily points to the reservation, but this is + being done to get a set of contiguous blocks for the new dinode. + Since this is a create, we don't have a sizehint yet, so it will + have to use the minimum reservation size. */ + error = gfs2_rs_alloc(dip); + if (error) + return error; + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; @@ -700,19 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto fail_gunlock2; - error = gfs2_inode_refresh(GFS2_I(inode)); + ip = GFS2_I(inode); + error = gfs2_inode_refresh(ip); if (error) goto fail_gunlock2; + /* The newly created inode needs a reservation so it can allocate + xattrs. At the same time, we want new blocks allocated to the new + dinode to be as contiguous as possible. Since we allocated the + dinode block under the directory's reservation, we transfer + ownership of that reservation to the new inode. The directory + doesn't need a reservation unless it needs a new allocation. */ + ip->i_res = dip->i_res; + dip->i_res = NULL; + error = gfs2_acl_create(dip, inode); if (error) goto fail_gunlock2; - error = gfs2_security_init(dip, GFS2_I(inode), name); + error = gfs2_security_init(dip, ip, name); if (error) goto fail_gunlock2; - error = link_dinode(dip, name, GFS2_I(inode)); + error = link_dinode(dip, name, ip); if (error) goto fail_gunlock2; @@ -722,10 +739,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_trans_end(sdp); /* Check if we reserved space in the rgrp. Function link_dinode may not, depending on whether alloc is required. */ - if (dip->i_res) + if (gfs2_mb_reserved(dip)) gfs2_inplace_release(dip); gfs2_quota_unlock(dip); - gfs2_qadata_put(dip); mark_inode_dirty(inode); gfs2_glock_dq_uninit_m(2, ghs); d_instantiate(dentry, inode); @@ -740,6 +756,7 @@ fail_gunlock: iput(inode); } fail: + gfs2_rs_delete(dip); if (bh) brelse(bh); return error; @@ -755,11 +772,8 @@ fail: */ static int gfs2_create(struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { - int excl = 0; - if (nd && (nd->flags & LOOKUP_EXCL)) - excl = 1; return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); } @@ -775,7 +789,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, */ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode && !IS_ERR(inode)) { @@ -819,6 +833,10 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (S_ISDIR(inode->i_mode)) return -EPERM; + error = gfs2_rs_alloc(dip); + if (error) + return error; + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); @@ -870,16 +888,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, error = 0; if (alloc_required) { - struct gfs2_qadata *qa = gfs2_qadata_get(dip); - - if (!qa) { - error = -ENOMEM; - goto out_gunlock; - } - error = gfs2_quota_lock_check(dip); if (error) - goto out_alloc; + goto out_gunlock; error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); if (error) @@ -922,9 +933,6 @@ out_ipres: out_gunlock_q: if (alloc_required) gfs2_quota_unlock(dip); -out_alloc: - if (alloc_required) - gfs2_qadata_put(dip); out_gunlock: gfs2_glock_dq(ghs + 1); out_child: @@ -1234,6 +1242,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) return error; + error = gfs2_rs_alloc(ndip); + if (error) + return error; + if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, &r_gh); @@ -1357,16 +1369,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (alloc_required) { - struct gfs2_qadata *qa = gfs2_qadata_get(ndip); - - if (!qa) { - error = -ENOMEM; - goto out_gunlock; - } - error = gfs2_quota_lock_check(ndip); if (error) - goto out_alloc; + goto out_gunlock; error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); if (error) @@ -1427,9 +1432,6 @@ out_ipreserv: out_gunlock_q: if (alloc_required) gfs2_quota_unlock(ndip); -out_alloc: - if (alloc_required) - gfs2_qadata_put(ndip); out_gunlock: while (x--) { gfs2_glock_dq(ghs + x); @@ -1590,12 +1592,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) ogid = ngid = NO_QUOTA_CHANGE; - if (!gfs2_qadata_get(ip)) - return -ENOMEM; - error = gfs2_quota_lock(ip, nuid, ngid); if (error) - goto out_alloc; + return error; if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { error = gfs2_quota_check(ip, nuid, ngid); @@ -1621,8 +1620,6 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock_q: gfs2_quota_unlock(ip); -out_alloc: - gfs2_qadata_put(ip); return error; } @@ -1644,6 +1641,10 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) struct gfs2_holder i_gh; int error; + error = gfs2_rs_alloc(ip); + if (error) + return error; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) return error; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 852c1be1dd3..8ff95a2d54e 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -401,9 +401,14 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) goto out; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - gfs2_meta_check(sdp, bd->bd_bh); - gfs2_pin(sdp, bd->bd_bh); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + printk(KERN_ERR + "Attempting to add uninitialised block to journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); sdp->sd_log_num_buf++; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 6cdb0f2a1b0..e04d0e09ee7 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -43,7 +43,6 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); - ip->i_qadata = NULL; ip->i_res = NULL; ip->i_hash_cache = NULL; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 6c1e5d1c404..3a56c8d94de 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -213,8 +213,10 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct gfs2_sbd *sdp = gl->gl_sbd; struct buffer_head *bh; - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + *bhp = NULL; return -EIO; + } *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); @@ -235,6 +237,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, if (tr && tr->tr_touched) gfs2_io_error_bh(sdp, bh); brelse(bh); + *bhp = NULL; return -EIO; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b8c250fc492..e5af9dc420e 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1118,20 +1118,33 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent } error = init_names(sdp, silent); - if (error) - goto fail; + if (error) { + /* In this case, we haven't initialized sysfs, so we have to + manually free the sdp. */ + free_percpu(sdp->sd_lkstats); + kfree(sdp); + sb->s_fs_info = NULL; + return error; + } snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); - gfs2_create_debugfs_file(sdp); - error = gfs2_sys_fs_add(sdp); + /* + * If we hit an error here, gfs2_sys_fs_add will have called function + * kobject_put which causes the sysfs usage count to go to zero, which + * causes sysfs to call function gfs2_sbd_release, which frees sdp. + * Subsequent error paths here will call gfs2_sys_fs_del, which also + * kobject_put to free sdp. + */ if (error) - goto fail; + return error; + + gfs2_create_debugfs_file(sdp); error = gfs2_lm_mount(sdp, silent); if (error) - goto fail_sys; + goto fail_debug; error = init_locking(sdp, &mount_gh, DO); if (error) @@ -1215,12 +1228,12 @@ fail_locking: fail_lm: gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); -fail_sys: - gfs2_sys_fs_del(sdp); -fail: +fail_debug: gfs2_delete_debugfs_file(sdp); free_percpu(sdp->sd_lkstats); - kfree(sdp); + /* gfs2_sys_fs_del must be the last thing we do, since it causes + * sysfs to call function gfs2_sbd_release, which frees sdp. */ + gfs2_sys_fs_del(sdp); sb->s_fs_info = NULL; return error; } @@ -1286,7 +1299,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, error = -EBUSY; goto error_bdev; } - s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev); + s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); error = PTR_ERR(s); if (IS_ERR(s)) @@ -1316,7 +1329,6 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); @@ -1360,7 +1372,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, dev_name, error); return ERR_PTR(error); } - s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, + s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { @@ -1390,10 +1402,9 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); - kill_block_super(sb); gfs2_delete_debugfs_file(sdp); free_percpu(sdp->sd_lkstats); - kfree(sdp); + kill_block_super(sb); } struct file_system_type gfs2_fs_type = { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index b97178e7d39..a3bde91645c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -494,11 +494,15 @@ static void qdsb_put(struct gfs2_quota_data *qd) int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_qadata *qa = ip->i_qadata; - struct gfs2_quota_data **qd = qa->qa_qd; + struct gfs2_quota_data **qd; int error; - if (gfs2_assert_warn(sdp, !qa->qa_qd_num) || + if (ip->i_res == NULL) + gfs2_rs_alloc(ip); + + qd = ip->i_res->rs_qa_qd; + + if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) return -EIO; @@ -508,20 +512,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); if (error) goto out; - qa->qa_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); if (error) goto out; - qa->qa_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { error = qdsb_get(sdp, QUOTA_USER, uid, qd); if (error) goto out; - qa->qa_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; } @@ -529,7 +533,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); if (error) goto out; - qa->qa_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; } @@ -542,16 +546,17 @@ out: void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_qadata *qa = ip->i_qadata; unsigned int x; + if (ip->i_res == NULL) + return; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); - for (x = 0; x < qa->qa_qd_num; x++) { - qdsb_put(qa->qa_qd[x]); - qa->qa_qd[x] = NULL; + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qdsb_put(ip->i_res->rs_qa_qd[x]); + ip->i_res->rs_qa_qd[x] = NULL; } - qa->qa_qd_num = 0; + ip->i_res->rs_qa_qd_num = 0; } static int sort_qd(const void *a, const void *b) @@ -764,6 +769,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) unsigned int nalloc = 0, blocks; int error; + error = gfs2_rs_alloc(ip); + if (error) + return error; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); @@ -915,7 +924,6 @@ fail: int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; unsigned int x; int error = 0; @@ -928,15 +936,15 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - sort(qa->qa_qd, qa->qa_qd_num, sizeof(struct gfs2_quota_data *), - sort_qd, NULL); + sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num, + sizeof(struct gfs2_quota_data *), sort_qd, NULL); - for (x = 0; x < qa->qa_qd_num; x++) { + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { int force = NO_FORCE; - qd = qa->qa_qd[x]; + qd = ip->i_res->rs_qa_qd[x]; if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) force = FORCE; - error = do_glock(qd, force, &qa->qa_qd_ghs[x]); + error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]); if (error) break; } @@ -945,7 +953,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) - gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); gfs2_quota_unhold(ip); } @@ -990,7 +998,6 @@ static int need_sync(struct gfs2_quota_data *qd) void gfs2_quota_unlock(struct gfs2_inode *ip) { - struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qda[4]; unsigned int count = 0; unsigned int x; @@ -998,14 +1005,14 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; - for (x = 0; x < qa->qa_qd_num; x++) { + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { struct gfs2_quota_data *qd; int sync; - qd = qa->qa_qd[x]; + qd = ip->i_res->rs_qa_qd[x]; sync = need_sync(qd); - gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); if (sync && qd_trylock(qd)) qda[count++] = qd; @@ -1038,7 +1045,6 @@ static int print_message(struct gfs2_quota_data *qd, char *type) int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; s64 value; unsigned int x; @@ -1050,8 +1056,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - for (x = 0; x < qa->qa_qd_num; x++) { - qd = qa->qa_qd[x]; + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qd = ip->i_res->rs_qa_qd[x]; if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) @@ -1089,7 +1095,6 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid) { - struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; unsigned int x; @@ -1098,8 +1103,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - for (x = 0; x < qa->qa_qd_num; x++) { - qd = qa->qa_qd[x]; + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qd = ip->i_res->rs_qa_qd[x]; if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { @@ -1108,7 +1113,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } -int gfs2_quota_sync(struct super_block *sb, int type, int wait) +int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; @@ -1154,7 +1159,7 @@ int gfs2_quota_sync(struct super_block *sb, int type, int wait) static int gfs2_quota_sync_timeo(struct super_block *sb, int type) { - return gfs2_quota_sync(sb, type, 0); + return gfs2_quota_sync(sb, type); } int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) @@ -1549,10 +1554,14 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if (error) return error; + error = gfs2_rs_alloc(ip); + if (error) + goto out_put; + mutex_lock(&ip->i_inode.i_mutex); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) - goto out_put; + goto out_unlockput; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) goto out_q; @@ -1609,8 +1618,9 @@ out_i: gfs2_glock_dq_uninit(&i_gh); out_q: gfs2_glock_dq_uninit(&q_gh); -out_put: +out_unlockput: mutex_unlock(&ip->i_inode.i_mutex); +out_put: qd_put(qd); return error; } diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 90bf1c302a9..f25d98b8790 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -26,7 +26,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid); -extern int gfs2_quota_sync(struct super_block *sb, int type, int wait); +extern int gfs2_quota_sync(struct super_block *sb, int type); extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); extern int gfs2_quota_init(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f74fb9bd197..4d34887a601 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -35,6 +35,9 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) +#define RSRV_CONTENTION_FACTOR 4 +#define RGRP_RSRV_MAX_CONTENDERS 2 + #if BITS_PER_LONG == 32 #define LBITMASK (0x55555555UL) #define LBITSKIP55 (0x55555555UL) @@ -178,6 +181,57 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) } /** + * rs_cmp - multi-block reservation range compare + * @blk: absolute file system block number of the new reservation + * @len: number of blocks in the new reservation + * @rs: existing reservation to compare against + * + * returns: 1 if the block range is beyond the reach of the reservation + * -1 if the block range is before the start of the reservation + * 0 if the block range overlaps with the reservation + */ +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) +{ + u64 startblk = gfs2_rs_startblk(rs); + + if (blk >= startblk + rs->rs_free) + return 1; + if (blk + len - 1 < startblk) + return -1; + return 0; +} + +/** + * rs_find - Find a rgrp multi-block reservation that contains a given block + * @rgd: The rgrp + * @rgblk: The block we're looking for, relative to the rgrp + */ +static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk) +{ + struct rb_node **newn; + int rc; + u64 fsblk = rgblk + rgd->rd_data0; + + spin_lock(&rgd->rd_rsspin); + newn = &rgd->rd_rstree.rb_node; + while (*newn) { + struct gfs2_blkreserv *cur = + rb_entry(*newn, struct gfs2_blkreserv, rs_node); + rc = rs_cmp(fsblk, 1, cur); + if (rc < 0) + newn = &((*newn)->rb_left); + else if (rc > 0) + newn = &((*newn)->rb_right); + else { + spin_unlock(&rgd->rd_rsspin); + return cur; + } + } + spin_unlock(&rgd->rd_rsspin); + return NULL; +} + +/** * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing * a block in a given allocation state. * @buf: the buffer that holds the bitmaps @@ -417,6 +471,137 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) } } +/** + * gfs2_rs_alloc - make sure we have a reservation assigned to the inode + * @ip: the inode for this reservation + */ +int gfs2_rs_alloc(struct gfs2_inode *ip) +{ + int error = 0; + struct gfs2_blkreserv *res; + + if (ip->i_res) + return 0; + + res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); + if (!res) + error = -ENOMEM; + + down_write(&ip->i_rw_mutex); + if (ip->i_res) + kmem_cache_free(gfs2_rsrv_cachep, res); + else + ip->i_res = res; + up_write(&ip->i_rw_mutex); + return error; +} + +static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs) +{ + gfs2_print_dbg(seq, " r: %llu s:%llu b:%u f:%u\n", + rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk, + rs->rs_free); +} + +/** + * __rs_deltree - remove a multi-block reservation from the rgd tree + * @rs: The reservation to remove + * + */ +static void __rs_deltree(struct gfs2_blkreserv *rs) +{ + struct gfs2_rgrpd *rgd; + + if (!gfs2_rs_active(rs)) + return; + + rgd = rs->rs_rgd; + /* We can't do this: The reason is that when the rgrp is invalidated, + it's in the "middle" of acquiring the glock, but the HOLDER bit + isn't set yet: + BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/ + trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL); + + if (!RB_EMPTY_ROOT(&rgd->rd_rstree)) + rb_erase(&rs->rs_node, &rgd->rd_rstree); + BUG_ON(!rgd->rd_rs_cnt); + rgd->rd_rs_cnt--; + + if (rs->rs_free) { + /* return reserved blocks to the rgrp and the ip */ + BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free); + rs->rs_rgd->rd_reserved -= rs->rs_free; + rs->rs_free = 0; + clear_bit(GBF_FULL, &rs->rs_bi->bi_flags); + smp_mb__after_clear_bit(); + } + /* We can't change any of the step 1 or step 2 components of the rs. + E.g. We can't set rs_rgd to NULL because the rgd glock is held and + dequeued through this pointer. + Can't: atomic_set(&rs->rs_sizehint, 0); + Can't: rs->rs_requested = 0; + Can't: rs->rs_rgd = NULL;*/ + rs->rs_bi = NULL; + rs->rs_biblk = 0; +} + +/** + * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree + * @rs: The reservation to remove + * + */ +void gfs2_rs_deltree(struct gfs2_blkreserv *rs) +{ + struct gfs2_rgrpd *rgd; + + if (!gfs2_rs_active(rs)) + return; + + rgd = rs->rs_rgd; + spin_lock(&rgd->rd_rsspin); + __rs_deltree(rs); + spin_unlock(&rgd->rd_rsspin); +} + +/** + * gfs2_rs_delete - delete a multi-block reservation + * @ip: The inode for this reservation + * + */ +void gfs2_rs_delete(struct gfs2_inode *ip) +{ + down_write(&ip->i_rw_mutex); + if (ip->i_res) { + gfs2_rs_deltree(ip->i_res); + trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE); + BUG_ON(ip->i_res->rs_free); + kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); + ip->i_res = NULL; + } + up_write(&ip->i_rw_mutex); +} + +/** + * return_all_reservations - return all reserved blocks back to the rgrp. + * @rgd: the rgrp that needs its space back + * + * We previously reserved a bunch of blocks for allocation. Now we need to + * give them back. This leave the reservation structures in tact, but removes + * all of their corresponding "no-fly zones". + */ +static void return_all_reservations(struct gfs2_rgrpd *rgd) +{ + struct rb_node *n; + struct gfs2_blkreserv *rs; + + spin_lock(&rgd->rd_rsspin); + while ((n = rb_first(&rgd->rd_rstree))) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + __rs_deltree(rs); + } + spin_unlock(&rgd->rd_rsspin); +} + void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) { struct rb_node *n; @@ -439,6 +624,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) gfs2_free_clones(rgd); kfree(rgd->rd_bits); + return_all_reservations(rgd); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } @@ -616,6 +802,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_data0 = be64_to_cpu(buf.ri_data0); rgd->rd_data = be32_to_cpu(buf.ri_data); rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); + spin_lock_init(&rgd->rd_rsspin); error = compute_bitstructs(rgd); if (error) @@ -627,6 +814,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lvb; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; @@ -736,9 +924,65 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); } +static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd) +{ + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data; + + if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free || + rgl->rl_dinodes != str->rg_dinodes || + rgl->rl_igeneration != str->rg_igeneration) + return 0; + return 1; +} + +static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + + rgl->rl_magic = cpu_to_be32(GFS2_MAGIC); + rgl->rl_flags = str->rg_flags; + rgl->rl_free = str->rg_free; + rgl->rl_dinodes = str->rg_dinodes; + rgl->rl_igeneration = str->rg_igeneration; + rgl->__pad = 0UL; +} + +static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change) +{ + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change; + rgl->rl_unlinked = cpu_to_be32(unlinked); +} + +static u32 count_unlinked(struct gfs2_rgrpd *rgd) +{ + struct gfs2_bitmap *bi; + const u32 length = rgd->rd_length; + const u8 *buffer = NULL; + u32 i, goal, count = 0; + + for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) { + goal = 0; + buffer = bi->bi_bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bi->bi_bh)); + while (goal < bi->bi_len * GFS2_NBBY) { + goal = gfs2_bitfit(buffer, bi->bi_len, goal, + GFS2_BLKST_UNLINKED); + if (goal == BFITNOENT) + break; + count++; + goal++; + } + } + + return count; +} + + /** - * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps - * @gh: The glock holder for the resource group + * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. @@ -746,9 +990,8 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) * Returns: errno */ -int gfs2_rgrp_go_lock(struct gfs2_holder *gh) +int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) { - struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; unsigned int length = rgd->rd_length; @@ -756,6 +999,9 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) unsigned int x, y; int error; + if (rgd->rd_bits[0].bi_bh != NULL) + return 0; + for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); @@ -782,7 +1028,20 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; } - + if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { + rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, + rgd->rd_bits[0].bi_bh->b_data); + } + else if (sdp->sd_args.ar_rgrplvb) { + if (!gfs2_rgrp_lvb_valid(rgd)){ + gfs2_consist_rgrpd(rgd); + error = -EIO; + goto fail; + } + if (rgd->rd_rgl->rl_unlinked == 0) + rgd->rd_flags &= ~GFS2_RDF_CHECK; + } return 0; fail: @@ -796,6 +1055,39 @@ fail: return error; } +int update_rgrp_lvb(struct gfs2_rgrpd *rgd) +{ + u32 rl_flags; + + if (rgd->rd_flags & GFS2_RDF_UPTODATE) + return 0; + + if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) + return gfs2_rgrp_bh_get(rgd); + + rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); + rl_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + if (rgd->rd_rgl->rl_unlinked == 0) + rgd->rd_flags &= ~GFS2_RDF_CHECK; + rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); + rgd->rd_free_clone = rgd->rd_free; + rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes); + rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration); + return 0; +} + +int gfs2_rgrp_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; + struct gfs2_sbd *sdp = rgd->rd_sbd; + + if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) + return 0; + return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); +} + /** * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() * @gh: The glock holder for the resource group @@ -809,8 +1101,10 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) for (x = 0; x < length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - brelse(bi->bi_bh); - bi->bi_bh = NULL; + if (bi->bi_bh) { + brelse(bi->bi_bh); + bi->bi_bh = NULL; + } } } @@ -954,6 +1248,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) rgd->rd_flags |= GFS2_RGF_TRIMMED; gfs2_trans_add_bh(rgd->rd_gl, bh, 1); gfs2_rgrp_out(rgd, bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); gfs2_trans_end(sdp); } } @@ -974,38 +1269,184 @@ out: } /** - * gfs2_qadata_get - get the struct gfs2_qadata structure for an inode - * @ip: the incore GFS2 inode structure + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree + * @bi: the bitmap with the blocks + * @ip: the inode structure + * @biblk: the 32-bit block number relative to the start of the bitmap + * @amount: the number of blocks to reserve * - * Returns: the struct gfs2_qadata + * Returns: NULL - reservation was already taken, so not inserted + * pointer to the inserted reservation */ +static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi, + struct gfs2_inode *ip, u32 biblk, + int amount) +{ + struct rb_node **newn, *parent = NULL; + int rc; + struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_rgrpd *rgd = rs->rs_rgd; + u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0; -struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip) + spin_lock(&rgd->rd_rsspin); + newn = &rgd->rd_rstree.rb_node; + BUG_ON(!ip->i_res); + BUG_ON(gfs2_rs_active(rs)); + /* Figure out where to put new node */ + /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/ + while (*newn) { + struct gfs2_blkreserv *cur = + rb_entry(*newn, struct gfs2_blkreserv, rs_node); + + parent = *newn; + rc = rs_cmp(fsblock, amount, cur); + if (rc > 0) + newn = &((*newn)->rb_right); + else if (rc < 0) + newn = &((*newn)->rb_left); + else { + spin_unlock(&rgd->rd_rsspin); + return NULL; /* reservation already in use */ + } + } + + /* Do our reservation work */ + rs = ip->i_res; + rs->rs_free = amount; + rs->rs_biblk = biblk; + rs->rs_bi = bi; + rb_link_node(&rs->rs_node, parent, newn); + rb_insert_color(&rs->rs_node, &rgd->rd_rstree); + + /* Do our inode accounting for the reservation */ + /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/ + + /* Do our rgrp accounting for the reservation */ + rgd->rd_reserved += amount; /* blocks reserved */ + rgd->rd_rs_cnt++; /* number of in-tree reservations */ + spin_unlock(&rgd->rd_rsspin); + trace_gfs2_rs(ip, rs, TRACE_RS_INSERT); + return rs; +} + +/** + * unclaimed_blocks - return number of blocks that aren't spoken for + */ +static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - int error; - BUG_ON(ip->i_qadata != NULL); - ip->i_qadata = kzalloc(sizeof(struct gfs2_qadata), GFP_NOFS); - error = gfs2_rindex_update(sdp); - if (error) - fs_warn(sdp, "rindex update returns %d\n", error); - return ip->i_qadata; + return rgd->rd_free_clone - rgd->rd_reserved; } /** - * gfs2_blkrsv_get - get the struct gfs2_blkreserv structure for an inode - * @ip: the incore GFS2 inode structure + * rg_mblk_search - find a group of multiple free blocks + * @rgd: the resource group descriptor + * @rs: the block reservation + * @ip: pointer to the inode for which we're reserving blocks * - * Returns: the struct gfs2_qadata + * This is very similar to rgblk_search, except we're looking for whole + * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing + * on aligned dwords for speed's sake. + * + * Returns: 0 if successful or BFITNOENT if there isn't enough free space */ -static int gfs2_blkrsv_get(struct gfs2_inode *ip) +static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { - BUG_ON(ip->i_res != NULL); - ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); - if (!ip->i_res) - return -ENOMEM; - return 0; + struct gfs2_bitmap *bi = rgd->rd_bits; + const u32 length = rgd->rd_length; + u32 blk; + unsigned int buf, x, search_bytes; + u8 *buffer = NULL; + u8 *ptr, *end, *nonzero; + u32 goal, rsv_bytes; + struct gfs2_blkreserv *rs; + u32 best_rs_bytes, unclaimed; + int best_rs_blocks; + + /* Find bitmap block that contains bits for goal block */ + if (rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal - rgd->rd_data0; + else + goal = rgd->rd_last_alloc; + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + /* Convert scope of "goal" from rgrp-wide to within + found bit block */ + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { + goal -= bi->bi_start * GFS2_NBBY; + goto do_search; + } + } + buf = 0; + goal = 0; + +do_search: + best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint), + (RGRP_RSRV_MINBLKS * rgd->rd_length)); + best_rs_bytes = (best_rs_blocks * + (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) / + GFS2_NBBY; /* 1 + is for our not-yet-created reservation */ + best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64)); + unclaimed = unclaimed_blocks(rgd); + if (best_rs_bytes * GFS2_NBBY > unclaimed) + best_rs_bytes = unclaimed >> GFS2_BIT_SIZE; + + for (x = 0; x <= length; x++) { + bi = rgd->rd_bits + buf; + + if (test_bit(GBF_FULL, &bi->bi_flags)) + goto skip; + + WARN_ON(!buffer_uptodate(bi->bi_bh)); + if (bi->bi_clone) + buffer = bi->bi_clone + bi->bi_offset; + else + buffer = bi->bi_bh->b_data + bi->bi_offset; + + /* We have to keep the reservations aligned on u64 boundaries + otherwise we could get situations where a byte can't be + used because it's after a reservation, but a free bit still + is within the reservation's area. */ + ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64)); + end = (buffer + bi->bi_len); + while (ptr < end) { + rsv_bytes = 0; + if ((ptr + best_rs_bytes) <= end) + search_bytes = best_rs_bytes; + else + search_bytes = end - ptr; + BUG_ON(!search_bytes); + nonzero = memchr_inv(ptr, 0, search_bytes); + /* If the lot is all zeroes, reserve the whole size. If + there's enough zeroes to satisfy the request, use + what we can. If there's not enough, keep looking. */ + if (nonzero == NULL) + rsv_bytes = search_bytes; + else if ((nonzero - ptr) * GFS2_NBBY >= + ip->i_res->rs_requested) + rsv_bytes = (nonzero - ptr); + + if (rsv_bytes) { + blk = ((ptr - buffer) * GFS2_NBBY); + BUG_ON(blk >= bi->bi_len * GFS2_NBBY); + rs = rs_insert(bi, ip, blk, + rsv_bytes * GFS2_NBBY); + if (IS_ERR(rs)) + return PTR_ERR(rs); + if (rs) + return 0; + } + ptr += ALIGN(search_bytes, sizeof(u64)); + } +skip: + /* Try next bitmap block (wrap back to rgrp header + if at end) */ + buf++; + buf %= length; + goal = 0; + } + + return BFITNOENT; } /** @@ -1014,24 +1455,26 @@ static int gfs2_blkrsv_get(struct gfs2_inode *ip) * @ip: the inode * * If there's room for the requested blocks to be allocated from the RG: + * This will try to get a multi-block reservation first, and if that doesn't + * fit, it will take what it can. * * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) */ -static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { - const struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = ip->i_res; if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; - if (rgd->rd_free_clone >= rs->rs_requested) + /* Look for a multi-block reservation. */ + if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS && + rg_mblk_search(rgd, ip) != BFITNOENT) + return 1; + if (unclaimed_blocks(rgd) >= rs->rs_requested) return 1; - return 0; -} -static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk) -{ - return (bi->bi_start * GFS2_NBBY) + blk; + return 0; } /** @@ -1101,119 +1544,120 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip } /** - * get_local_rgrp - Choose and lock a rgrp for allocation + * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for - * @last_unlinked: the last unlinked block - * - * Try to acquire rgrp in way which avoids contending with others. + * @requested: the number of blocks to be reserved * * Returns: errno */ -static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) +int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrpd *rgd, *begin = NULL; + struct gfs2_rgrpd *begin = NULL; struct gfs2_blkreserv *rs = ip->i_res; - int error, rg_locked, flags = LM_FLAG_TRY; + int error = 0, rg_locked, flags = LM_FLAG_TRY; + u64 last_unlinked = NO_BLOCK; int loops = 0; - if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) - rgd = begin = ip->i_rgd; - else - rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); - - if (rgd == NULL) + if (sdp->sd_args.ar_rgrplvb) + flags |= GL_SKIP; + rs->rs_requested = requested; + if (gfs2_assert_warn(sdp, requested)) { + error = -EINVAL; + goto out; + } + if (gfs2_rs_active(rs)) { + begin = rs->rs_rgd; + flags = 0; /* Yoda: Do or do not. There is no try */ + } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { + rs->rs_rgd = begin = ip->i_rgd; + } else { + rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); + } + if (rs->rs_rgd == NULL) return -EBADSLT; while (loops < 3) { rg_locked = 0; - if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { + if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) { rg_locked = 1; error = 0; + } else if (!loops && !gfs2_rs_active(rs) && + rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) { + /* If the rgrp already is maxed out for contenders, + we can eliminate it as a "first pass" without even + requesting the rgrp glock. */ + error = GLR_TRYFAILED; } else { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - flags, &rs->rs_rgd_gh); + error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl, + LM_ST_EXCLUSIVE, flags, + &rs->rs_rgd_gh); + if (!error && sdp->sd_args.ar_rgrplvb) { + error = update_rgrp_lvb(rs->rs_rgd); + if (error) { + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + return error; + } + } } switch (error) { case 0: - if (try_rgrp_fit(rgd, ip)) { - ip->i_rgd = rgd; + if (gfs2_rs_active(rs)) { + if (unclaimed_blocks(rs->rs_rgd) + + rs->rs_free >= rs->rs_requested) { + ip->i_rgd = rs->rs_rgd; + return 0; + } + /* We have a multi-block reservation, but the + rgrp doesn't have enough free blocks to + satisfy the request. Free the reservation + and look for a suitable rgrp. */ + gfs2_rs_deltree(rs); + } + if (try_rgrp_fit(rs->rs_rgd, ip)) { + if (sdp->sd_args.ar_rgrplvb) + gfs2_rgrp_bh_get(rs->rs_rgd); + ip->i_rgd = rs->rs_rgd; return 0; } - if (rgd->rd_flags & GFS2_RDF_CHECK) - try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) { + if (sdp->sd_args.ar_rgrplvb) + gfs2_rgrp_bh_get(rs->rs_rgd); + try_rgrp_unlink(rs->rs_rgd, &last_unlinked, + ip->i_no_addr); + } if (!rg_locked) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); /* fall through */ case GLR_TRYFAILED: - rgd = gfs2_rgrpd_get_next(rgd); - if (rgd == begin) { - flags = 0; - loops++; - } + rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd); + rs->rs_rgd = rs->rs_rgd ? : begin; /* if NULL, wrap */ + if (rs->rs_rgd != begin) /* If we didn't wrap */ + break; + + flags &= ~LM_FLAG_TRY; + loops++; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && + !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + goto out; + } else if (loops == 2) + /* Flushing the log may release space */ + gfs2_log_flush(sdp, NULL); break; default: - return error; + goto out; } } - - return -ENOSPC; -} - -static void gfs2_blkrsv_put(struct gfs2_inode *ip) -{ - BUG_ON(ip->i_res == NULL); - kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); - ip->i_res = NULL; -} - -/** - * gfs2_inplace_reserve - Reserve space in the filesystem - * @ip: the inode to reserve space for - * @requested: the number of blocks to be reserved - * - * Returns: errno - */ - -int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_blkreserv *rs; - int error; - u64 last_unlinked = NO_BLOCK; - int tries = 0; - - error = gfs2_blkrsv_get(ip); - if (error) - return error; - - rs = ip->i_res; - rs->rs_requested = requested; - if (gfs2_assert_warn(sdp, requested)) { - error = -EINVAL; - goto out; - } - - do { - error = get_local_rgrp(ip, &last_unlinked); - if (error != -ENOSPC) - break; - /* Check that fs hasn't grown if writing to rindex */ - if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { - error = gfs2_ri_update(ip); - if (error) - break; - continue; - } - /* Flushing the log may release space */ - gfs2_log_flush(sdp, NULL); - } while (tries++ < 3); + error = -ENOSPC; out: if (error) - gfs2_blkrsv_put(ip); + rs->rs_requested = 0; return error; } @@ -1228,9 +1672,15 @@ void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_blkreserv *rs = ip->i_res; + if (!rs) + return; + + if (!rs->rs_free) + gfs2_rs_deltree(rs); + if (rs->rs_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); - gfs2_blkrsv_put(ip); + rs->rs_requested = 0; } /** @@ -1326,7 +1776,27 @@ do_search: if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; - biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state); + while (1) { + struct gfs2_blkreserv *rs; + u32 rgblk; + + biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state); + if (biblk == BFITNOENT) + break; + /* Check if this block is reserved() */ + rgblk = gfs2_bi2rgd_blk(bi, biblk); + rs = rs_find(rgd, rgblk); + if (rs == NULL) + break; + + BUG_ON(rs->rs_bi != bi); + biblk = BFITNOENT; + /* This should jump to the first block after the + reservation. */ + goal = rs->rs_biblk + rs->rs_free; + if (goal >= bi->bi_len * GFS2_NBBY) + break; + } if (biblk != BFITNOENT) break; @@ -1362,8 +1832,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, u32 blk, bool dinode, unsigned int *n) { const unsigned int elen = *n; - u32 goal; + u32 goal, rgblk; const u8 *buffer = NULL; + struct gfs2_blkreserv *rs; *n = 0; buffer = bi->bi_bh->b_data + bi->bi_offset; @@ -1376,6 +1847,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, goal++; if (goal >= (bi->bi_len * GFS2_NBBY)) break; + rgblk = gfs2_bi2rgd_blk(bi, goal); + rs = rs_find(rgd, rgblk); + if (rs) /* Oops, we bumped into someone's reservation */ + break; if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != GFS2_BLKST_FREE) break; @@ -1451,12 +1926,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) { - const struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_blkreserv *trs; + const struct rb_node *n; + if (rgd == NULL) return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", (unsigned long long)rgd->rd_addr, rgd->rd_flags, - rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, + rgd->rd_reserved); + spin_lock(&rgd->rd_rsspin); + for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { + trs = rb_entry(n, struct gfs2_blkreserv, rs_node); + dump_rs(seq, trs); + } + spin_unlock(&rgd->rd_rsspin); return 0; } @@ -1471,10 +1956,63 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) } /** + * claim_reserved_blks - Claim previously reserved blocks + * @ip: the inode that's claiming the reservation + * @dinode: 1 if this block is a dinode block, otherwise data block + * @nblocks: desired extent length + * + * Lay claim to previously allocated block reservation blocks. + * Returns: Starting block number of the blocks claimed. + * Sets *nblocks to the actual extent length allocated. + */ +static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode, + unsigned int *nblocks) +{ + struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_rgrpd *rgd = rs->rs_rgd; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_bitmap *bi; + u64 start_block = gfs2_rs_startblk(rs); + const unsigned int elen = *nblocks; + + /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/ + gfs2_assert_withdraw(sdp, rgd); + /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/ + bi = rs->rs_bi; + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + + for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) { + /* Make sure the bitmap hasn't changed */ + gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk, + dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + rs->rs_biblk++; + rs->rs_free--; + + BUG_ON(!rgd->rd_reserved); + rgd->rd_reserved--; + dinode = false; + trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM); + } + + if (!rs->rs_free) { + struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd; + + gfs2_rs_deltree(rs); + /* -nblocks because we haven't returned to do the math yet. + I'm doing the math backwards to prevent negative numbers, + but think of it as: + if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */ + if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks) + rg_mblk_search(rgd, ip); + } + return start_block; +} + +/** * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode * @ip: the inode to allocate the block for * @bn: Used to return the starting block number - * @ndata: requested number of blocks/extent length (value/result) + * @nblocks: requested number of blocks/extent length (value/result) * @dinode: 1 if we're allocating a dinode block, else 0 * @generation: the generation number of the inode * @@ -1496,23 +2034,37 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, /* Only happens if there is a bug in gfs2, return something distinctive * to ensure that it is noticed. */ - if (ip->i_res == NULL) + if (ip->i_res->rs_requested == 0) return -ECANCELED; - rgd = ip->i_rgd; - - if (!dinode && rgrp_contains_block(rgd, ip->i_goal)) - goal = ip->i_goal - rgd->rd_data0; - else - goal = rgd->rd_last_alloc; - - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi); + /* Check if we have a multi-block reservation, and if so, claim the + next free block from it. */ + if (gfs2_rs_active(ip->i_res)) { + BUG_ON(!ip->i_res->rs_free); + rgd = ip->i_res->rs_rgd; + block = claim_reserved_blks(ip, dinode, nblocks); + } else { + rgd = ip->i_rgd; - /* Since all blocks are reserved in advance, this shouldn't happen */ - if (blk == BFITNOENT) - goto rgrp_error; + if (!dinode && rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal - rgd->rd_data0; + else + goal = rgd->rd_last_alloc; + + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi); + + /* Since all blocks are reserved in advance, this shouldn't + happen */ + if (blk == BFITNOENT) { + printk(KERN_WARNING "BFITNOENT, nblocks=%u\n", + *nblocks); + printk(KERN_WARNING "FULL=%d\n", + test_bit(GBF_FULL, &rgd->rd_bits->bi_flags)); + goto rgrp_error; + } - block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks); + block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks); + } ndata = *nblocks; if (dinode) ndata--; @@ -1529,8 +2081,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, brelse(dibh); } } - if (rgd->rd_free < *nblocks) + if (rgd->rd_free < *nblocks) { + printk(KERN_WARNING "nblocks=%u\n", *nblocks); goto rgrp_error; + } rgd->rd_free -= *nblocks; if (dinode) { @@ -1542,6 +2096,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) @@ -1588,6 +2143,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) rgd->rd_flags &= ~GFS2_RGF_TRIMMED; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth) @@ -1624,6 +2180,8 @@ void gfs2_unlink_di(struct inode *inode) trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + update_rgrp_lvb_unlinked(rgd, 1); } static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) @@ -1643,6 +2201,8 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + update_rgrp_lvb_unlinked(rgd, -1); gfs2_statfs_change(sdp, 0, +1, -1); } @@ -1784,6 +2344,7 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_uninit(&rlist->rl_ghs[x]); kfree(rlist->rl_ghs); + rlist->rl_ghs = NULL; } } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b4b10f4de25..ca6e26729b8 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -13,6 +13,14 @@ #include <linux/slab.h> #include <linux/uaccess.h> +/* Since each block in the file system is represented by two bits in the + * bitmap, one 64-bit word in the bitmap will represent 32 blocks. + * By reserving 32 blocks at a time, we can optimize / shortcut how we search + * through the bitmaps by looking a word at a time. + */ +#define RGRP_RSRV_MINBYTES 8 +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY)) + struct gfs2_rgrpd; struct gfs2_sbd; struct gfs2_holder; @@ -29,13 +37,7 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); -extern struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip); -static inline void gfs2_qadata_put(struct gfs2_inode *ip) -{ - BUG_ON(ip->i_qadata == NULL); - kfree(ip->i_qadata); - ip->i_qadata = NULL; -} +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); extern void gfs2_inplace_release(struct gfs2_inode *ip); @@ -43,6 +45,9 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); +extern int gfs2_rs_alloc(struct gfs2_inode *ip); +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); +extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); @@ -68,4 +73,30 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); extern int gfs2_fitrim(struct file *filp, void __user *argp); +/* This is how to tell if a multi-block reservation is "inplace" reserved: */ +static inline int gfs2_mb_reserved(struct gfs2_inode *ip) +{ + if (ip->i_res && ip->i_res->rs_requested) + return 1; + return 0; +} + +/* This is how to tell if a multi-block reservation is in the rgrp tree: */ +static inline int gfs2_rs_active(struct gfs2_blkreserv *rs) +{ + if (rs && rs->rs_bi) + return 1; + return 0; +} + +static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk) +{ + return (bi->bi_start * GFS2_NBBY) + blk; +} + +static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs) +{ + return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0; +} + #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 713e621c240..fc3168f47a1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -78,6 +78,8 @@ enum { Opt_quota_quantum, Opt_barrier, Opt_nobarrier, + Opt_rgrplvb, + Opt_norgrplvb, Opt_error, }; @@ -115,6 +117,8 @@ static const match_table_t tokens = { {Opt_quota_quantum, "quota_quantum=%d"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, + {Opt_rgrplvb, "rgrplvb"}, + {Opt_norgrplvb, "norgrplvb"}, {Opt_error, NULL} }; @@ -267,6 +271,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) case Opt_nobarrier: args->ar_nobarrier = 1; break; + case Opt_rgrplvb: + args->ar_rgrplvb = 1; + break; + case Opt_norgrplvb: + args->ar_rgrplvb = 0; + break; case Opt_error: default: printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); @@ -838,7 +848,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) int error; flush_workqueue(gfs2_delete_workqueue); - gfs2_quota_sync(sdp->sd_vfs, 0, 1); + gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, @@ -952,6 +962,8 @@ restart: static int gfs2_sync_fs(struct super_block *sb, int wait) { struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_quota_sync(sb, -1); if (wait && sdp) gfs2_log_flush(sdp, NULL); return 0; @@ -1379,6 +1391,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",nobarrier"); if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) seq_printf(s, ",demote_interface_used"); + if (args->ar_rgrplvb) + seq_printf(s, ",rgrplvb"); return 0; } @@ -1399,7 +1413,6 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip) static int gfs2_dinode_dealloc(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_qadata *qa; struct gfs2_rgrpd *rgd; struct gfs2_holder gh; int error; @@ -1409,13 +1422,13 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) return -EIO; } - qa = gfs2_qadata_get(ip); - if (!qa) - return -ENOMEM; + error = gfs2_rindex_update(sdp); + if (error) + return error; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) - goto out; + return error; rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); if (!rgd) { @@ -1443,8 +1456,6 @@ out_rg_gunlock: gfs2_glock_dq_uninit(&gh); out_qs: gfs2_quota_unhold(ip); -out: - gfs2_qadata_put(ip); return error; } @@ -1545,6 +1556,9 @@ out_truncate: out_unlock: /* Error path for case 1 */ + if (gfs2_rs_active(ip->i_res)) + gfs2_rs_deltree(ip->i_res); + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); @@ -1554,6 +1568,7 @@ out_unlock: out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); + gfs2_rs_delete(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; @@ -1576,6 +1591,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) ip->i_flags = 0; ip->i_gl = NULL; ip->i_rgd = NULL; + ip->i_res = NULL; } return &ip->i_inode; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9c2592b1d5f..8056b7b7238 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -168,7 +168,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_quota_sync(sdp->sd_vfs, 0, 1); + gfs2_quota_sync(sdp->sd_vfs, 0); return len; } @@ -276,7 +276,15 @@ static struct attribute *gfs2_attrs[] = { NULL, }; +static void gfs2_sbd_release(struct kobject *kobj) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + + kfree(sdp); +} + static struct kobj_type gfs2_ktype = { + .release = gfs2_sbd_release, .default_attrs = gfs2_attrs, .sysfs_ops = &gfs2_attr_ops, }; @@ -583,6 +591,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) char ro[20]; char spectator[20]; char *envp[] = { ro, spectator, NULL }; + int sysfs_frees_sdp = 0; sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); @@ -591,8 +600,10 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, "%s", sdp->sd_table_name); if (error) - goto fail; + goto fail_reg; + sysfs_frees_sdp = 1; /* Freeing sdp is now done by sysfs calling + function gfs2_sbd_release. */ error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) goto fail_reg; @@ -615,9 +626,13 @@ fail_lock_module: fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_reg: - kobject_put(&sdp->sd_kobj); -fail: + free_percpu(sdp->sd_lkstats); fs_err(sdp, "error %d adding sysfs files", error); + if (sysfs_frees_sdp) + kobject_put(&sdp->sd_kobj); + else + kfree(sdp); + sb->s_fs_info = NULL; return error; } diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 1b8b8158819..a25c252fe41 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -14,6 +14,7 @@ #include <linux/ktime.h> #include "incore.h" #include "glock.h" +#include "rgrp.h" #define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } #define glock_trace_name(x) __print_symbolic(x, \ @@ -31,6 +32,17 @@ { GFS2_BLKST_DINODE, "dinode" }, \ { GFS2_BLKST_UNLINKED, "unlinked" }) +#define TRACE_RS_DELETE 0 +#define TRACE_RS_TREEDEL 1 +#define TRACE_RS_INSERT 2 +#define TRACE_RS_CLAIM 3 + +#define rs_func_name(x) __print_symbolic(x, \ + { 0, "del " }, \ + { 1, "tdel" }, \ + { 2, "ins " }, \ + { 3, "clm " }) + #define show_glock_flags(flags) __print_flags(flags, "", \ {(1UL << GLF_LOCK), "l" }, \ {(1UL << GLF_DEMOTE), "D" }, \ @@ -470,6 +482,7 @@ TRACE_EVENT(gfs2_block_alloc, __field( u8, block_state ) __field( u64, rd_addr ) __field( u32, rd_free_clone ) + __field( u32, rd_reserved ) ), TP_fast_assign( @@ -480,16 +493,58 @@ TRACE_EVENT(gfs2_block_alloc, __entry->block_state = block_state; __entry->rd_addr = rgd->rd_addr; __entry->rd_free_clone = rgd->rd_free_clone; + __entry->rd_reserved = rgd->rd_reserved; ), - TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u", + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long)__entry->len, block_state_name(__entry->block_state), (unsigned long long)__entry->rd_addr, - __entry->rd_free_clone) + __entry->rd_free_clone, (unsigned long)__entry->rd_reserved) +); + +/* Keep track of multi-block reservations as they are allocated/freed */ +TRACE_EVENT(gfs2_rs, + + TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs, + u8 func), + + TP_ARGS(ip, rs, func), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) + __field( u32, rd_reserved ) + __field( u64, inum ) + __field( u64, start ) + __field( u32, free ) + __field( u8, func ) + ), + + TP_fast_assign( + __entry->dev = rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0; + __entry->rd_addr = rs->rs_rgd ? rs->rs_rgd->rd_addr : 0; + __entry->rd_free_clone = rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0; + __entry->rd_reserved = rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0; + __entry->inum = ip ? ip->i_no_addr : 0; + __entry->start = gfs2_rs_startblk(rs); + __entry->free = rs->rs_free; + __entry->func = func; + ), + + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s " + "f:%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long long)__entry->rd_addr, + (unsigned long)__entry->rd_free_clone, + (unsigned long)__entry->rd_reserved, + rs_func_name(__entry->func), (unsigned long)__entry->free) ); #endif /* _TRACE_GFS2_H */ diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 125d4572e1c..41f42cdccbb 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -31,7 +31,7 @@ struct gfs2_glock; static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) { const struct gfs2_blkreserv *rs = ip->i_res; - if (rs->rs_requested < ip->i_rgd->rd_length) + if (rs && rs->rs_requested < ip->i_rgd->rd_length) return rs->rs_requested + 1; return ip->i_rgd->rd_length; } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 3586b0dd6aa..80535739ac7 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -79,23 +79,19 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *type, const char *function, char *file, unsigned int line); -static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, - struct buffer_head *bh, - const char *function, - char *file, unsigned int line) +static inline int gfs2_meta_check(struct gfs2_sbd *sdp, + struct buffer_head *bh) { struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); - if (unlikely(magic != GFS2_MAGIC)) - return gfs2_meta_check_ii(sdp, bh, "magic number", function, - file, line); + if (unlikely(magic != GFS2_MAGIC)) { + printk(KERN_ERR "GFS2: Magic number missing at %llu\n", + (unsigned long long)bh->b_blocknr); + return -EIO; + } return 0; } -#define gfs2_meta_check(sdp, bh) \ -gfs2_meta_check_i((sdp), (bh), __func__, __FILE__, __LINE__) - - int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, u16 type, u16 t, const char *function, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 927f4df874a..27a0b4a901f 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -325,12 +325,11 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, int leave) { - struct gfs2_qadata *qa; int error; - qa = gfs2_qadata_get(ip); - if (!qa) - return -ENOMEM; + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); + if (error) + return error; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) @@ -340,7 +339,6 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, gfs2_quota_unhold(ip); out_alloc: - gfs2_qadata_put(ip); return error; } @@ -713,17 +711,16 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, unsigned int blks, ea_skeleton_call_t skeleton_call, void *private) { - struct gfs2_qadata *qa; struct buffer_head *dibh; int error; - qa = gfs2_qadata_get(ip); - if (!qa) - return -ENOMEM; + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); + if (error) + return error; error = gfs2_quota_lock_check(ip); if (error) - goto out; + return error; error = gfs2_inplace_reserve(ip, blks); if (error) @@ -753,8 +750,6 @@ out_ipres: gfs2_inplace_release(ip); out_gunlock_q: gfs2_quota_unlock(ip); -out: - gfs2_qadata_put(ip); return error; } @@ -1494,16 +1489,15 @@ out_gunlock: int gfs2_ea_dealloc(struct gfs2_inode *ip) { - struct gfs2_qadata *qa; int error; - qa = gfs2_qadata_get(ip); - if (!qa) - return -ENOMEM; + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); + if (error) + return error; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); if (error) - goto out_alloc; + return error; error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) @@ -1519,8 +1513,6 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) out_quota: gfs2_quota_unhold(ip); -out_alloc: - gfs2_qadata_put(ip); return error; } diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 62fc14ea4b7..422dde2ec0a 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -18,7 +18,7 @@ * hfs_lookup() */ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { hfs_cat_rec rec; struct hfs_find_data fd; @@ -187,7 +187,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * the directory and the name (and its length) of the new file. */ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode; int res; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 2c16316d291..a67955a0c36 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -432,7 +432,7 @@ out: if (inode->i_ino < HFS_FIRSTUSER_CNID) set_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags); set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); - sb->s_dirt = 1; + hfs_mark_mdb_dirty(sb); } return res; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 1bf967c6bfd..8275175acf6 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/fs.h> +#include <linux/workqueue.h> #include <asm/byteorder.h> #include <asm/uaccess.h> @@ -137,16 +138,15 @@ struct hfs_sb_info { gid_t s_gid; /* The gid of all files */ int session, part; - struct nls_table *nls_io, *nls_disk; - struct mutex bitmap_lock; - unsigned long flags; - u16 blockoffset; - int fs_div; + struct super_block *sb; + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work mdb_work; /* MDB flush delayed work */ + spinlock_t work_lock; /* protects mdb_work and work_queued */ }; #define HFS_FLG_BITMAP_DIRTY 0 @@ -226,6 +226,9 @@ extern int hfs_compare_dentry(const struct dentry *parent, extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); +/* super.c */ +extern void hfs_mark_mdb_dirty(struct super_block *sb); + extern struct timezone sys_tz; /* @@ -253,7 +256,7 @@ static inline const char *hfs_mdb_name(struct super_block *sb) static inline void hfs_bitmap_dirty(struct super_block *sb) { set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); - sb->s_dirt = 1; + hfs_mark_mdb_dirty(sb); } #define sb_bread512(sb, sec, data) ({ \ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 761ec06354b..ee1bc55677f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -220,7 +220,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode) insert_inode_hash(inode); mark_inode_dirty(inode); set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); - sb->s_dirt = 1; + hfs_mark_mdb_dirty(sb); return inode; } @@ -235,7 +235,7 @@ void hfs_delete_inode(struct inode *inode) if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_dirs--; set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); - sb->s_dirt = 1; + hfs_mark_mdb_dirty(sb); return; } HFS_SB(sb)->file_count--; @@ -248,7 +248,7 @@ void hfs_delete_inode(struct inode *inode) } } set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); - sb->s_dirt = 1; + hfs_mark_mdb_dirty(sb); } void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, @@ -489,7 +489,7 @@ out: } static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; hfs_cat_rec rec; @@ -644,13 +644,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, /* sync the superblock to buffers */ sb = inode->i_sb; - if (sb->s_dirt) { - lock_super(sb); - sb->s_dirt = 0; - if (!(sb->s_flags & MS_RDONLY)) - hfs_mdb_commit(sb); - unlock_super(sb); - } + flush_delayed_work_sync(&HFS_SB(sb)->mdb_work); /* .. finally sync the buffers to disk */ err = sync_blockdev(sb->s_bdev); if (!ret) diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 1563d5ce576..5fd51a5833f 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -260,6 +260,10 @@ void hfs_mdb_commit(struct super_block *sb) { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; + if (sb->s_flags & MS_RDONLY) + return; + + lock_buffer(HFS_SB(sb)->mdb_bh); if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); @@ -283,9 +287,13 @@ void hfs_mdb_commit(struct super_block *sb) &mdb->drXTFlSize, NULL); hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, &mdb->drCTFlSize, NULL); + + lock_buffer(HFS_SB(sb)->alt_mdb_bh); memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); + unlock_buffer(HFS_SB(sb)->alt_mdb_bh); + mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } @@ -308,7 +316,11 @@ void hfs_mdb_commit(struct super_block *sb) break; } len = min((int)sb->s_blocksize - off, size); + + lock_buffer(bh); memcpy(bh->b_data + off, ptr, len); + unlock_buffer(bh); + mark_buffer_dirty(bh); brelse(bh); block++; @@ -317,6 +329,7 @@ void hfs_mdb_commit(struct super_block *sb) size -= len; } } + unlock_buffer(HFS_SB(sb)->mdb_bh); } void hfs_mdb_close(struct super_block *sb) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 7b4c537d6e1..4eb873e0c07 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -29,43 +29,9 @@ static struct kmem_cache *hfs_inode_cachep; MODULE_LICENSE("GPL"); -/* - * hfs_write_super() - * - * Description: - * This function is called by the VFS only. When the filesystem - * is mounted r/w it updates the MDB on disk. - * Input Variable(s): - * struct super_block *sb: Pointer to the hfs superblock - * Output Variable(s): - * NONE - * Returns: - * void - * Preconditions: - * 'sb' points to a "valid" (struct super_block). - * Postconditions: - * The MDB is marked 'unsuccessfully unmounted' by clearing bit 8 of drAtrb - * (hfs_put_super() must set this flag!). Some MDB fields are updated - * and the MDB buffer is written to disk by calling hfs_mdb_commit(). - */ -static void hfs_write_super(struct super_block *sb) -{ - lock_super(sb); - sb->s_dirt = 0; - - /* sync everything to the buffers */ - if (!(sb->s_flags & MS_RDONLY)) - hfs_mdb_commit(sb); - unlock_super(sb); -} - static int hfs_sync_fs(struct super_block *sb, int wait) { - lock_super(sb); hfs_mdb_commit(sb); - sb->s_dirt = 0; - unlock_super(sb); - return 0; } @@ -78,13 +44,44 @@ static int hfs_sync_fs(struct super_block *sb, int wait) */ static void hfs_put_super(struct super_block *sb) { - if (sb->s_dirt) - hfs_write_super(sb); + cancel_delayed_work_sync(&HFS_SB(sb)->mdb_work); hfs_mdb_close(sb); /* release the MDB's resources */ hfs_mdb_put(sb); } +static void flush_mdb(struct work_struct *work) +{ + struct hfs_sb_info *sbi; + struct super_block *sb; + + sbi = container_of(work, struct hfs_sb_info, mdb_work.work); + sb = sbi->sb; + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + hfs_mdb_commit(sb); +} + +void hfs_mark_mdb_dirty(struct super_block *sb) +{ + struct hfs_sb_info *sbi = HFS_SB(sb); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->mdb_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); +} + /* * hfs_statfs() * @@ -184,7 +181,6 @@ static const struct super_operations hfs_super_operations = { .write_inode = hfs_write_inode, .evict_inode = hfs_evict_inode, .put_super = hfs_put_super, - .write_super = hfs_write_super, .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, .remount_fs = hfs_remount, @@ -387,7 +383,10 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + sbi->sb = sb; sb->s_fs_info = sbi; + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); res = -EINVAL; if (!parse_options((char *)data, sbi)) { diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 19cf291eb91..91b91fd3a90 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -13,12 +13,12 @@ /* dentry case-handling: just lowercase everything */ -static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) +static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) { struct inode *inode; int diff; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index 1cad80c789c..4cfbe2edd29 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -153,7 +153,7 @@ done: kunmap(page); *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; - sb->s_dirt = 1; + hfsplus_mark_mdb_dirty(sb); dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); @@ -228,7 +228,7 @@ out: set_page_dirty(page); kunmap(page); sbi->free_blocks += len; - sb->s_dirt = 1; + hfsplus_mark_mdb_dirty(sb); mutex_unlock(&sbi->alloc_mutex); return 0; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 26b53fb09f6..6b9f921ef2f 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -25,7 +25,7 @@ static inline void hfsplus_instantiate(struct dentry *dentry, /* Find the entry inside dir named dentry->d_name */ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; struct hfs_find_data fd; @@ -316,7 +316,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); sbi->file_count++; - dst_dir->i_sb->s_dirt = 1; + hfsplus_mark_mdb_dirty(dst_dir->i_sb); out: mutex_unlock(&sbi->vh_mutex); return res; @@ -465,7 +465,7 @@ out: } static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { return hfsplus_mknod(dir, dentry, mode, 0); } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 4e75ac646fe..558dbb463a4 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -153,8 +153,11 @@ struct hfsplus_sb_info { gid_t gid; int part, session; - unsigned long flags; + + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work sync_work; /* FS sync delayed work */ + spinlock_t work_lock; /* protects sync_work and work_queued */ }; #define HFSPLUS_SB_WRITEBACKUP 0 @@ -428,7 +431,7 @@ int hfsplus_show_options(struct seq_file *, struct dentry *); /* super.c */ struct inode *hfsplus_iget(struct super_block *, unsigned long); -int hfsplus_sync_fs(struct super_block *sb, int wait); +void hfsplus_mark_mdb_dirty(struct super_block *sb); /* tables.c */ extern u16 hfsplus_case_fold_table[]; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 82b69ee4dac..3d8b4a675ba 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -168,7 +168,7 @@ const struct dentry_operations hfsplus_dentry_operations = { }; static struct dentry *hfsplus_file_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct hfs_find_data fd; struct super_block *sb = dir->i_sb; @@ -431,7 +431,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) sbi->file_count++; insert_inode_hash(inode); mark_inode_dirty(inode); - sb->s_dirt = 1; + hfsplus_mark_mdb_dirty(sb); return inode; } @@ -442,7 +442,7 @@ void hfsplus_delete_inode(struct inode *inode) if (S_ISDIR(inode->i_mode)) { HFSPLUS_SB(sb)->folder_count--; - sb->s_dirt = 1; + hfsplus_mark_mdb_dirty(sb); return; } HFSPLUS_SB(sb)->file_count--; @@ -455,7 +455,7 @@ void hfsplus_delete_inode(struct inode *inode) inode->i_size = 0; hfsplus_file_truncate(inode); } - sb->s_dirt = 1; + hfsplus_mark_mdb_dirty(sb); } void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index a9bca4b8768..47333209801 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -124,7 +124,7 @@ static int hfsplus_system_write_inode(struct inode *inode) if (fork->total_size != cpu_to_be64(inode->i_size)) { set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); - inode->i_sb->s_dirt = 1; + hfsplus_mark_mdb_dirty(inode->i_sb); } hfsplus_inode_write_fork(inode, fork); if (tree) @@ -161,7 +161,7 @@ static void hfsplus_evict_inode(struct inode *inode) } } -int hfsplus_sync_fs(struct super_block *sb, int wait) +static int hfsplus_sync_fs(struct super_block *sb, int wait) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; @@ -171,9 +171,7 @@ int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - dprint(DBG_SUPER, "hfsplus_write_super\n"); - - sb->s_dirt = 0; + dprint(DBG_SUPER, "hfsplus_sync_fs\n"); /* * Explicitly write out the special metadata inodes. @@ -226,12 +224,34 @@ out: return error; } -static void hfsplus_write_super(struct super_block *sb) +static void delayed_sync_fs(struct work_struct *work) { - if (!(sb->s_flags & MS_RDONLY)) - hfsplus_sync_fs(sb, 1); - else - sb->s_dirt = 0; + struct hfsplus_sb_info *sbi; + + sbi = container_of(work, struct hfsplus_sb_info, sync_work.work); + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); +} + +void hfsplus_mark_mdb_dirty(struct super_block *sb) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->sync_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); } static void hfsplus_put_super(struct super_block *sb) @@ -240,8 +260,7 @@ static void hfsplus_put_super(struct super_block *sb) dprint(DBG_SUPER, "hfsplus_put_super\n"); - if (!sb->s_fs_info) - return; + cancel_delayed_work_sync(&sbi->sync_work); if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { struct hfsplus_vh *vhdr = sbi->s_vhdr; @@ -328,7 +347,6 @@ static const struct super_operations hfsplus_sops = { .write_inode = hfsplus_write_inode, .evict_inode = hfsplus_evict_inode, .put_super = hfsplus_put_super, - .write_super = hfsplus_write_super, .sync_fs = hfsplus_sync_fs, .statfs = hfsplus_statfs, .remount_fs = hfsplus_remount, @@ -355,6 +373,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); hfsplus_fill_defaults(sbi); err = -EINVAL; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2afa5bbccf9..124146543aa 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -553,7 +553,7 @@ static int read_name(struct inode *ino, char *name) } int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode; char *name; @@ -595,7 +595,7 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode; char *name; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index b8472f803f4..78e12b2e0ea 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -189,7 +189,7 @@ out: * to tell read_inode to read fnode or not. */ -struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index c07ef1f1ced..ac1ead194db 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -220,7 +220,7 @@ extern const struct dentry_operations hpfs_dentry_operations; /* dir.c */ -struct dentry *hpfs_lookup(struct inode *, struct dentry *, struct nameidata *); +struct dentry *hpfs_lookup(struct inode *, struct dentry *, unsigned int); extern const struct file_operations hpfs_dir_ops; /* dnode.c */ diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 9083ef8af58..bc9082482f6 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -115,7 +115,7 @@ bail: return err; } -static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) +static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index d4f93b52cec..c1dffe47fde 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -138,7 +138,7 @@ static int file_removed(struct dentry *dentry, const char *file) } static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *proc_dentry, *parent; struct qstr *name = &dentry->d_name; @@ -420,8 +420,7 @@ static int hppfs_open(struct inode *inode, struct file *file) { const struct cred *cred = file->f_cred; struct hppfs_private *data; - struct vfsmount *proc_mnt; - struct dentry *proc_dentry; + struct path path; char *host_file; int err, fd, type, filter; @@ -434,12 +433,11 @@ static int hppfs_open(struct inode *inode, struct file *file) if (host_file == NULL) goto out_free2; - proc_dentry = HPPFS_I(inode)->proc_dentry; - proc_mnt = inode->i_sb->s_fs_info; + path.mnt = inode->i_sb->s_fs_info; + path.dentry = HPPFS_I(inode)->proc_dentry; /* XXX This isn't closed anywhere */ - data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), - file_mode(file->f_mode), cred); + data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); err = PTR_ERR(data->proc_file); if (IS_ERR(data->proc_file)) goto out_free1; @@ -484,8 +482,7 @@ static int hppfs_dir_open(struct inode *inode, struct file *file) { const struct cred *cred = file->f_cred; struct hppfs_private *data; - struct vfsmount *proc_mnt; - struct dentry *proc_dentry; + struct path path; int err; err = -ENOMEM; @@ -493,10 +490,9 @@ static int hppfs_dir_open(struct inode *inode, struct file *file) if (data == NULL) goto out; - proc_dentry = HPPFS_I(inode)->proc_dentry; - proc_mnt = inode->i_sb->s_fs_info; - data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), - file_mode(file->f_mode), cred); + path.mnt = inode->i_sb->s_fs_info; + path.dentry = HPPFS_I(inode)->proc_dentry; + data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); err = PTR_ERR(data->proc_file); if (IS_ERR(data->proc_file)) goto out_free; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index cc9281b6c62..e13e9bdb0bf 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -565,7 +565,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mod return retval; } -static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } diff --git a/fs/inode.c b/fs/inode.c index c99163b1b31..775cbabd4fa 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -182,7 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) } inode->i_private = NULL; inode->i_mapping = mapping; - INIT_LIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ + INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif diff --git a/fs/internal.h b/fs/internal.h index 18bc216ea09..a6fd56c68b1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -42,6 +42,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) extern void __init chrdev_init(void); /* + * namei.c + */ +extern int __inode_permission(struct inode *, int); + +/* * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); @@ -50,8 +55,6 @@ extern int copy_mount_string(const void __user *, char **); extern struct vfsmount *lookup_mnt(struct path *); extern int finish_automount(struct vfsmount *, struct path *); -extern void mnt_make_longterm(struct vfsmount *); -extern void mnt_make_shortterm(struct vfsmount *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); @@ -84,9 +87,6 @@ extern struct super_block *user_get_super(dev_t); /* * open.c */ -struct nameidata; -extern struct file *nameidata_to_filp(struct nameidata *); -extern void release_open_intent(struct nameidata *); struct open_flags { int open_flag; umode_t mode; diff --git a/fs/isofs/export.c b/fs/isofs/export.c index aa4356d09ee..1d3804492aa 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -134,6 +134,7 @@ isofs_export_encode_fh(struct inode *inode, len = 3; fh32[0] = ei->i_iget5_block; fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ + fh16[3] = 0; /* avoid leaking uninitialized data */ fh32[2] = inode->i_generation; if (parent) { struct iso_inode_info *eparent; diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 0e73f63d927..3620ad1ea9b 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -114,7 +114,7 @@ extern int isofs_name_translate(struct iso_directory_record *, char *, struct in int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *); int get_acorn_filename(struct iso_directory_record *, char *, struct inode *); -extern struct dentry *isofs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int flags); extern struct buffer_head *isofs_bread(struct inode *, sector_t); extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 1e2946f2a69..c167028844e 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -163,7 +163,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, return 0; } -struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int found; unsigned long uninitialized_var(block); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 008bf062fd2..a748fe21465 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -265,8 +265,11 @@ int journal_recover(journal_t *journal) if (!err) err = err2; /* Flush disk caches to get replayed data on the permanent storage */ - if (journal->j_flags & JFS_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (journal->j_flags & JFS_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } return err; } diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index b56018896d5..ad7774d3209 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -25,9 +25,9 @@ static int jffs2_readdir (struct file *, void *, filldir_t); static int jffs2_create (struct inode *,struct dentry *,umode_t, - struct nameidata *); + bool); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, - struct nameidata *); + unsigned int); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct inode *,struct dentry *,const char *); @@ -74,7 +74,7 @@ const struct inode_operations jffs2_dir_inode_operations = nice and simple */ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, - struct nameidata *nd) + unsigned int flags) { struct jffs2_inode_info *dir_f; struct jffs2_full_dirent *fd = NULL, *fd_list; @@ -175,7 +175,7 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) static int jffs2_create(struct inode *dir_i, struct dentry *dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { struct jffs2_raw_inode *ri; struct jffs2_inode_info *f, *dir_f; @@ -226,8 +226,8 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, __func__, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; fail: @@ -446,8 +446,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; fail: @@ -591,8 +591,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; fail: @@ -766,8 +766,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); return 0; fail: diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 07c91ca6017..3b91a7ad608 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -73,7 +73,7 @@ static inline void free_ea_wmap(struct inode *inode) * */ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { int rc = 0; tid_t tid; /* transaction id */ @@ -176,8 +176,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, unlock_new_inode(ip); iput(ip); } else { - d_instantiate(dentry, ip); unlock_new_inode(ip); + d_instantiate(dentry, ip); } out2: @@ -309,8 +309,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) unlock_new_inode(ip); iput(ip); } else { - d_instantiate(dentry, ip); unlock_new_inode(ip); + d_instantiate(dentry, ip); } out2: @@ -1043,8 +1043,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - d_instantiate(dentry, ip); unlock_new_inode(ip); + d_instantiate(dentry, ip); } out2: @@ -1424,8 +1424,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - d_instantiate(dentry, ip); unlock_new_inode(ip); + d_instantiate(dentry, ip); } out1: @@ -1436,7 +1436,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, return rc; } -static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { struct btstack btstack; ino_t inum; @@ -1570,7 +1570,7 @@ out: return result; } -static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) +static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) { /* * This is not negative dentry. Always valid. @@ -1589,7 +1589,7 @@ static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) * This may be nfsd (or something), anyway, we can't see the * intent of this. So, since this can be for creation, drop it. */ - if (!nd) + if (!flags) return 0; /* @@ -1597,7 +1597,7 @@ static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) * case sensitive name which is specified by user if this is * for creation. */ - if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; return 1; } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4a82950f412..c55c7452d28 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -601,6 +601,11 @@ static int jfs_sync_fs(struct super_block *sb, int wait) /* log == NULL indicates read-only mount */ if (log) { + /* + * Write quota structures to quota file, sync_blockdev() will + * write them to disk later + */ + dquot_writeback_dquots(sb, -1); jfs_flush_journal(log, wait); jfs_syncpt(log, 0); } diff --git a/fs/libfs.c b/fs/libfs.c index f86ec27a423..a74cb1725ac 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -53,7 +53,7 @@ static int simple_delete_dentry(const struct dentry *dentry) * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. */ -struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { static const struct dentry_operations simple_dentry_operations = { .d_delete = simple_delete_dentry, @@ -222,15 +222,15 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, const struct super_operations *ops, const struct dentry_operations *dops, unsigned long magic) { - struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct super_block *s; struct dentry *dentry; struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); + s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); if (IS_ERR(s)) return ERR_CAST(s); - s->s_flags = MS_NOUSER; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; diff --git a/fs/locks.c b/fs/locks.c index fce6238d52c..82c353304f9 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -308,7 +308,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, return 0; } -static int assign_type(struct file_lock *fl, int type) +static int assign_type(struct file_lock *fl, long type) { switch (type) { case F_RDLCK: @@ -445,7 +445,7 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, int type, struct file_lock *fl) +static int lease_init(struct file *filp, long type, struct file_lock *fl) { if (assign_type(fl, type) != 0) return -EINVAL; @@ -463,7 +463,7 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl) } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, int type) +static struct file_lock *lease_alloc(struct file *filp, long type) { struct file_lock *fl = locks_alloc_lock(); int error = -ENOMEM; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index bea5d1b9954..26e4a941532 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -349,7 +349,7 @@ static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) } static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct page *page; struct logfs_disk_dentry *dd; @@ -502,7 +502,7 @@ static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 97bca623d89..345c24b8a6f 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -519,7 +519,7 @@ static struct dentry *logfs_get_sb_device(struct logfs_super *super, log_super("LogFS: Start mount %x\n", mount_count++); err = -EINVAL; - sb = sget(type, logfs_sb_test, logfs_sb_set, super); + sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super); if (IS_ERR(sb)) { super->s_devops->put_device(super); kfree(super); @@ -542,7 +542,6 @@ static struct dentry *logfs_get_sb_device(struct logfs_super *super, sb->s_maxbytes = (1ull << 43) - 1; sb->s_max_links = LOGFS_LINK_MAX; sb->s_op = &logfs_super_operations; - sb->s_flags = flags | MS_NOATIME; err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); if (err) diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 2d0ee178630..0db73d9dd66 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -18,7 +18,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; @@ -55,7 +55,7 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, } static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { return minix_mknod(dir, dentry, mode, 0); } diff --git a/fs/mount.h b/fs/mount.h index 4ef36d93e5a..4f291f9de64 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -22,7 +22,6 @@ struct mount { struct vfsmount mnt; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; - atomic_t mnt_longterm; /* how many of the refs are longterm */ #else int mnt_count; int mnt_writers; @@ -49,6 +48,8 @@ struct mount { int mnt_ghosts; }; +#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ + static inline struct mount *real_mount(struct vfsmount *mnt) { return container_of(mnt, struct mount, mnt); @@ -59,6 +60,12 @@ static inline int mnt_has_parent(struct mount *mnt) return mnt != mnt->mnt_parent; } +static inline int is_mounted(struct vfsmount *mnt) +{ + /* neither detached nor internal? */ + return !IS_ERR_OR_NULL(real_mount(mnt)); +} + extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int); static inline void get_mnt_ns(struct mnt_namespace *ns) @@ -67,10 +74,12 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) } struct proc_mounts { - struct seq_file m; /* must be the first element */ + struct seq_file m; struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); }; +#define proc_mounts(p) (container_of((p), struct proc_mounts, m)) + extern const struct seq_operations mounts_op; diff --git a/fs/namei.c b/fs/namei.c index 7d694194024..2ccc35c4dc2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -315,31 +315,22 @@ static inline int do_inode_permission(struct inode *inode, int mask) } /** - * inode_permission - check for access rights to a given inode - * @inode: inode to check permission on - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) + * __inode_permission - Check for access rights to a given inode + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * - * Used to check for read/write/execute permissions on an inode. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things. + * Check for read/write/execute permissions on an inode. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. + * + * This does not check for a read-only file system. You probably want + * inode_permission(). */ -int inode_permission(struct inode *inode, int mask) +int __inode_permission(struct inode *inode, int mask) { int retval; if (unlikely(mask & MAY_WRITE)) { - umode_t mode = inode->i_mode; - - /* - * Nobody gets write access to a read-only fs. - */ - if (IS_RDONLY(inode) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) - return -EROFS; - /* * Nobody gets write access to an immutable file. */ @@ -359,6 +350,47 @@ int inode_permission(struct inode *inode, int mask) } /** + * sb_permission - Check superblock-level permissions + * @sb: Superblock of inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Separate out file-system wide checks from inode-specific permission checks. + */ +static int sb_permission(struct super_block *sb, struct inode *inode, int mask) +{ + if (unlikely(mask & MAY_WRITE)) { + umode_t mode = inode->i_mode; + + /* Nobody gets write access to a read-only fs. */ + if ((sb->s_flags & MS_RDONLY) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + } + return 0; +} + +/** + * inode_permission - Check for access rights to a given inode + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Check for read/write/execute permissions on an inode. We use fs[ug]id for + * this, letting us set arbitrary permissions for filesystem access without + * changing the "normal" UIDs which are used for other things. + * + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. + */ +int inode_permission(struct inode *inode, int mask) +{ + int retval; + + retval = sb_permission(inode->i_sb, inode, mask); + if (retval) + return retval; + return __inode_permission(inode, mask); +} + +/** * path_get - get a reference to a path * @path: path to get the reference to * @@ -395,6 +427,18 @@ EXPORT_SYMBOL(path_put); * to restart the path walk from the beginning in ref-walk mode. */ +static inline void lock_rcu_walk(void) +{ + br_read_lock(&vfsmount_lock); + rcu_read_lock(); +} + +static inline void unlock_rcu_walk(void) +{ + rcu_read_unlock(); + br_read_unlock(&vfsmount_lock); +} + /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -448,8 +492,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) } mntget(nd->path.mnt); - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); + unlock_rcu_walk(); nd->flags &= ~LOOKUP_RCU; return 0; @@ -463,25 +506,9 @@ err_root: return -ECHILD; } -/** - * release_open_intent - free up open intent resources - * @nd: pointer to nameidata - */ -void release_open_intent(struct nameidata *nd) +static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { - struct file *file = nd->intent.open.file; - - if (file && !IS_ERR(file)) { - if (file->f_path.dentry == NULL) - put_filp(file); - else - fput(file); - } -} - -static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - return dentry->d_op->d_revalidate(dentry, nd); + return dentry->d_op->d_revalidate(dentry, flags); } /** @@ -506,15 +533,13 @@ static int complete_walk(struct nameidata *nd) spin_lock(&dentry->d_lock); if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { spin_unlock(&dentry->d_lock); - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); + unlock_rcu_walk(); return -ECHILD; } BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); mntget(nd->path.mnt); - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); + unlock_rcu_walk(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -527,7 +552,7 @@ static int complete_walk(struct nameidata *nd) return 0; /* Note: we do not d_invalidate() */ - status = d_revalidate(dentry, nd); + status = d_revalidate(dentry, nd->flags); if (status > 0) return 0; @@ -602,10 +627,25 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +/* + * Helper to directly jump to a known parsed path from ->follow_link, + * caller must have taken a reference to path beforehand. + */ +void nd_jump_link(struct nameidata *nd, struct path *path) +{ + path_put(&nd->path); + + nd->path = *path; + nd->inode = nd->path.dentry->d_inode; + nd->flags |= LOOKUP_JUMPED; + + BUG_ON(nd->inode->i_op->follow_link); +} + static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) { struct inode *inode = link->dentry->d_inode; - if (!IS_ERR(cookie) && inode->i_op->put_link) + if (inode->i_op->put_link) inode->i_op->put_link(link->dentry, nd, cookie); path_put(link); } @@ -613,19 +653,19 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki static __always_inline int follow_link(struct path *link, struct nameidata *nd, void **p) { - int error; struct dentry *dentry = link->dentry; + int error; + char *s; BUG_ON(nd->flags & LOOKUP_RCU); if (link->mnt == nd->path.mnt) mntget(link->mnt); - if (unlikely(current->total_link_count >= 40)) { - *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ - path_put(&nd->path); - return -ELOOP; - } + error = -ELOOP; + if (unlikely(current->total_link_count >= 40)) + goto out_put_nd_path; + cond_resched(); current->total_link_count++; @@ -633,30 +673,28 @@ follow_link(struct path *link, struct nameidata *nd, void **p) nd_set_link(nd, NULL); error = security_inode_follow_link(link->dentry, nd); - if (error) { - *p = ERR_PTR(error); /* no ->put_link(), please */ - path_put(&nd->path); - return error; - } + if (error) + goto out_put_nd_path; nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); - if (!IS_ERR(*p)) { - char *s = nd_get_link(nd); - error = 0; - if (s) - error = __vfs_follow_link(nd, s); - else if (nd->last_type == LAST_BIND) { - nd->flags |= LOOKUP_JUMPED; - nd->inode = nd->path.dentry->d_inode; - if (nd->inode->i_op->follow_link) { - /* stepped on a _really_ weird one */ - path_put(&nd->path); - error = -ELOOP; - } - } + if (IS_ERR(*p)) + goto out_put_nd_path; + + error = 0; + s = nd_get_link(nd); + if (s) { + error = __vfs_follow_link(nd, s); + if (unlikely(error)) + put_link(nd, link, *p); } + + return error; + +out_put_nd_path: + path_put(&nd->path); + path_put(link); return error; } @@ -675,6 +713,16 @@ static int follow_up_rcu(struct path *path) return 1; } +/* + * follow_up - Find the mountpoint of path's vfsmount + * + * Given a path, find the mountpoint of its source file system. + * Replace @path with the path of the mountpoint in the parent mount. + * Up is towards /. + * + * Return 1 if we went up a level and 0 if we were already at the + * root. + */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); @@ -683,7 +731,7 @@ int follow_up(struct path *path) br_read_lock(&vfsmount_lock); parent = mnt->mnt_parent; - if (&parent->mnt == path->mnt) { + if (parent == mnt) { br_read_unlock(&vfsmount_lock); return 0; } @@ -946,8 +994,7 @@ failed: nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); + unlock_rcu_walk(); return -ECHILD; } @@ -1048,7 +1095,7 @@ static void follow_dotdot(struct nameidata *nd) * dir->d_inode->i_mutex must be held */ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, - struct nameidata *nd, bool *need_lookup) + unsigned int flags, bool *need_lookup) { struct dentry *dentry; int error; @@ -1059,7 +1106,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, if (d_need_lookup(dentry)) { *need_lookup = true; } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) { - error = d_revalidate(dentry, nd); + error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (error < 0) { dput(dentry); @@ -1089,7 +1136,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, * dir->d_inode->i_mutex must be held */ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *old; @@ -1099,7 +1146,7 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } - old = dir->i_op->lookup(dir, dentry, nd); + old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; @@ -1108,16 +1155,16 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, } static struct dentry *__lookup_hash(struct qstr *name, - struct dentry *base, struct nameidata *nd) + struct dentry *base, unsigned int flags) { bool need_lookup; struct dentry *dentry; - dentry = lookup_dcache(name, base, nd, &need_lookup); + dentry = lookup_dcache(name, base, flags, &need_lookup); if (!need_lookup) return dentry; - return lookup_real(base->d_inode, dentry, nd); + return lookup_real(base->d_inode, dentry, flags); } /* @@ -1167,7 +1214,7 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name, if (unlikely(d_need_lookup(dentry))) goto unlazy; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - status = d_revalidate(dentry, nd); + status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { if (status != -ECHILD) need_reval = 0; @@ -1197,7 +1244,7 @@ unlazy: } if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) - status = d_revalidate(dentry, nd); + status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { if (status < 0) { dput(dentry); @@ -1236,7 +1283,7 @@ static int lookup_slow(struct nameidata *nd, struct qstr *name, BUG_ON(nd->inode != parent->d_inode); mutex_lock(&parent->d_inode->i_mutex); - dentry = __lookup_hash(name, parent, nd); + dentry = __lookup_hash(name, parent, nd->flags); mutex_unlock(&parent->d_inode->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1284,8 +1331,7 @@ static void terminate_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); + unlock_rcu_walk(); } } @@ -1383,9 +1429,10 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) void *cookie; res = follow_link(&link, nd, &cookie); - if (!res) - res = walk_component(nd, path, &nd->last, - nd->last_type, LOOKUP_FOLLOW); + if (res) + break; + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); put_link(nd, &link, cookie); } while (res > 0); @@ -1651,8 +1698,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - br_read_lock(&vfsmount_lock); - rcu_read_lock(); + lock_rcu_walk(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); @@ -1664,8 +1710,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (*name=='/') { if (flags & LOOKUP_RCU) { - br_read_lock(&vfsmount_lock); - rcu_read_lock(); + lock_rcu_walk(); set_root_rcu(nd); } else { set_root(nd); @@ -1677,8 +1722,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct fs_struct *fs = current->fs; unsigned seq; - br_read_lock(&vfsmount_lock); - rcu_read_lock(); + lock_rcu_walk(); do { seq = read_seqcount_begin(&fs->seq); @@ -1713,8 +1757,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (fput_needed) *fp = file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(&vfsmount_lock); - rcu_read_lock(); + lock_rcu_walk(); } else { path_get(&file->f_path); fput_light(file, fput_needed); @@ -1777,8 +1820,9 @@ static int path_lookupat(int dfd, const char *name, struct path link = path; nd->flags |= LOOKUP_PARENT; err = follow_link(&link, nd, &cookie); - if (!err) - err = lookup_last(nd, &path); + if (err) + break; + err = lookup_last(nd, &path); put_link(nd, &link, cookie); } } @@ -1821,9 +1865,27 @@ static int do_path_lookup(int dfd, const char *name, return retval; } -int kern_path_parent(const char *name, struct nameidata *nd) +/* does lookup, returns the object with parent locked */ +struct dentry *kern_path_locked(const char *name, struct path *path) { - return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); + struct nameidata nd; + struct dentry *d; + int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); + if (err) + return ERR_PTR(err); + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return ERR_PTR(-EINVAL); + } + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = __lookup_hash(&nd.last, nd.path.dentry, 0); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return d; + } + *path = nd.path; + return d; } int kern_path(const char *name, unsigned int flags, struct path *path) @@ -1866,7 +1928,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, */ static struct dentry *lookup_hash(struct nameidata *nd) { - return __lookup_hash(&nd->last, nd->path.dentry, nd); + return __lookup_hash(&nd->last, nd->path.dentry, nd->flags); } /** @@ -1913,7 +1975,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - return __lookup_hash(&this, base, NULL); + return __lookup_hash(&this, base, 0); } int user_path_at_empty(int dfd, const char __user *name, unsigned flags, @@ -2086,10 +2148,9 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool want_excl) { int error = may_create(dir, dentry); - if (error) return error; @@ -2100,7 +2161,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(dir, dentry, mode, nd); + error = dir->i_op->create(dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; @@ -2187,21 +2248,275 @@ static inline int open_to_namei_flags(int flag) return flag; } +static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) +{ + int error = security_path_mknod(dir, dentry, mode, 0); + if (error) + return error; + + error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + if (error) + return error; + + return security_inode_create(dir->dentry->d_inode, dentry, mode); +} + /* - * Handle the last step of open() + * Attempt to atomically look up, create and open a file from a negative + * dentry. + * + * Returns 0 if successful. The file will have been created and attached to + * @file by the filesystem calling finish_open(). + * + * Returns 1 if the file was looked up only or didn't need creating. The + * caller will need to perform the open themselves. @path will have been + * updated to point to the new dentry. This may be negative. + * + * Returns an error code otherwise. + */ +static int atomic_open(struct nameidata *nd, struct dentry *dentry, + struct path *path, struct file *file, + const struct open_flags *op, + bool *want_write, bool need_lookup, + int *opened) +{ + struct inode *dir = nd->path.dentry->d_inode; + unsigned open_flag = open_to_namei_flags(op->open_flag); + umode_t mode; + int error; + int acc_mode; + int create_error = 0; + struct dentry *const DENTRY_NOT_SET = (void *) -1UL; + + BUG_ON(dentry->d_inode); + + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(dir))) { + error = -ENOENT; + goto out; + } + + mode = op->mode & S_IALLUGO; + if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) + mode &= ~current_umask(); + + if (open_flag & O_EXCL) { + open_flag &= ~O_TRUNC; + *opened |= FILE_CREATED; + } + + /* + * Checking write permission is tricky, bacuse we don't know if we are + * going to actually need it: O_CREAT opens should work as long as the + * file exists. But checking existence breaks atomicity. The trick is + * to check access and if not granted clear O_CREAT from the flags. + * + * Another problem is returing the "right" error value (e.g. for an + * O_EXCL open we want to return EEXIST not EROFS). + */ + if ((open_flag & (O_CREAT | O_TRUNC)) || + (open_flag & O_ACCMODE) != O_RDONLY) { + error = mnt_want_write(nd->path.mnt); + if (!error) { + *want_write = true; + } else if (!(open_flag & O_CREAT)) { + /* + * No O_CREATE -> atomicity not a requirement -> fall + * back to lookup + open + */ + goto no_open; + } else if (open_flag & (O_EXCL | O_TRUNC)) { + /* Fall back and fail with the right error */ + create_error = error; + goto no_open; + } else { + /* No side effects, safe to clear O_CREAT */ + create_error = error; + open_flag &= ~O_CREAT; + } + } + + if (open_flag & O_CREAT) { + error = may_o_create(&nd->path, dentry, op->mode); + if (error) { + create_error = error; + if (open_flag & O_EXCL) + goto no_open; + open_flag &= ~O_CREAT; + } + } + + if (nd->flags & LOOKUP_DIRECTORY) + open_flag |= O_DIRECTORY; + + file->f_path.dentry = DENTRY_NOT_SET; + file->f_path.mnt = nd->path.mnt; + error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode, + opened); + if (error < 0) { + if (create_error && error == -ENOENT) + error = create_error; + goto out; + } + + acc_mode = op->acc_mode; + if (*opened & FILE_CREATED) { + fsnotify_create(dir, dentry); + acc_mode = MAY_OPEN; + } + + if (error) { /* returned 1, that is */ + if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { + error = -EIO; + goto out; + } + if (file->f_path.dentry) { + dput(dentry); + dentry = file->f_path.dentry; + } + goto looked_up; + } + + /* + * We didn't have the inode before the open, so check open permission + * here. + */ + error = may_open(&file->f_path, acc_mode, open_flag); + if (error) + fput(file); + +out: + dput(dentry); + return error; + +no_open: + if (need_lookup) { + dentry = lookup_real(dir, dentry, nd->flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (create_error) { + int open_flag = op->open_flag; + + error = create_error; + if ((open_flag & O_EXCL)) { + if (!dentry->d_inode) + goto out; + } else if (!dentry->d_inode) { + goto out; + } else if ((open_flag & O_TRUNC) && + S_ISREG(dentry->d_inode->i_mode)) { + goto out; + } + /* will fail later, go on to get the right error */ + } + } +looked_up: + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; +} + +/* + * Look up and maybe create and open the last component. + * + * Must be called with i_mutex held on parent. + * + * Returns 0 if the file was successfully atomically created (if necessary) and + * opened. In this case the file will be returned attached to @file. + * + * Returns 1 if the file was not completely opened at this time, though lookups + * and creations will have been performed and the dentry returned in @path will + * be positive upon return if O_CREAT was specified. If O_CREAT wasn't + * specified then a negative dentry may be returned. + * + * An error code is returned otherwise. + * + * FILE_CREATE will be set in @*opened if the dentry was created and will be + * cleared otherwise prior to returning. */ -static struct file *do_last(struct nameidata *nd, struct path *path, - const struct open_flags *op, const char *pathname) +static int lookup_open(struct nameidata *nd, struct path *path, + struct file *file, + const struct open_flags *op, + bool *want_write, int *opened) { struct dentry *dir = nd->path.dentry; + struct inode *dir_inode = dir->d_inode; struct dentry *dentry; + int error; + bool need_lookup; + + *opened &= ~FILE_CREATED; + dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Cached positive dentry: will open in f_op->open */ + if (!need_lookup && dentry->d_inode) + goto out_no_open; + + if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { + return atomic_open(nd, dentry, path, file, op, want_write, + need_lookup, opened); + } + + if (need_lookup) { + BUG_ON(dentry->d_inode); + + dentry = lookup_real(dir_inode, dentry, nd->flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + + /* Negative dentry, just create the file */ + if (!dentry->d_inode && (op->open_flag & O_CREAT)) { + umode_t mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); + /* + * This write is needed to ensure that a + * rw->ro transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in finish_open(). + */ + error = mnt_want_write(nd->path.mnt); + if (error) + goto out_dput; + *want_write = true; + *opened |= FILE_CREATED; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto out_dput; + error = vfs_create(dir->d_inode, dentry, mode, + nd->flags & LOOKUP_EXCL); + if (error) + goto out_dput; + } +out_no_open: + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; + +out_dput: + dput(dentry); + return error; +} + +/* + * Handle the last step of open() + */ +static int do_last(struct nameidata *nd, struct path *path, + struct file *file, const struct open_flags *op, + int *opened, const char *pathname) +{ + struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; - int will_truncate = open_flag & O_TRUNC; - int want_write = 0; + bool will_truncate = (open_flag & O_TRUNC) != 0; + bool want_write = false; int acc_mode = op->acc_mode; - struct file *filp; struct inode *inode; - int symlink_ok = 0; + bool symlink_ok = false; struct path save_parent = { .dentry = NULL, .mnt = NULL }; bool retried = false; int error; @@ -2214,112 +2529,99 @@ static struct file *do_last(struct nameidata *nd, struct path *path, case LAST_DOT: error = handle_dots(nd, nd->last_type); if (error) - return ERR_PTR(error); + return error; /* fallthrough */ case LAST_ROOT: error = complete_walk(nd); if (error) - return ERR_PTR(error); + return error; audit_inode(pathname, nd->path.dentry); if (open_flag & O_CREAT) { error = -EISDIR; - goto exit; + goto out; } - goto ok; + goto finish_open; case LAST_BIND: error = complete_walk(nd); if (error) - return ERR_PTR(error); + return error; audit_inode(pathname, dir); - goto ok; + goto finish_open; } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) - symlink_ok = 1; + symlink_ok = true; /* we _can_ be in RCU mode here */ error = lookup_fast(nd, &nd->last, path, &inode); - if (unlikely(error)) { - if (error < 0) - goto exit; + if (likely(!error)) + goto finish_lookup; - error = lookup_slow(nd, &nd->last, path); - if (error < 0) - goto exit; + if (error < 0) + goto out; - inode = path->dentry->d_inode; - } - goto finish_lookup; - } - - /* create side of things */ - /* - * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been - * cleared when we got to the last component we are about to look up - */ - error = complete_walk(nd); - if (error) - return ERR_PTR(error); + BUG_ON(nd->inode != dir->d_inode); + } else { + /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED + * has been cleared when we got to the last component we are + * about to look up + */ + error = complete_walk(nd); + if (error) + return error; - audit_inode(pathname, dir); - error = -EISDIR; - /* trailing slashes? */ - if (nd->last.name[nd->last.len]) - goto exit; + audit_inode(pathname, dir); + error = -EISDIR; + /* trailing slashes? */ + if (nd->last.name[nd->last.len]) + goto out; + } retry_lookup: mutex_lock(&dir->d_inode->i_mutex); + error = lookup_open(nd, path, file, op, &want_write, opened); + mutex_unlock(&dir->d_inode->i_mutex); - dentry = lookup_hash(nd); - error = PTR_ERR(dentry); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); - goto exit; - } + if (error <= 0) { + if (error) + goto out; - path->dentry = dentry; - path->mnt = nd->path.mnt; + if ((*opened & FILE_CREATED) || + !S_ISREG(file->f_path.dentry->d_inode->i_mode)) + will_truncate = false; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { - umode_t mode = op->mode; - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - /* - * This write is needed to ensure that a - * rw->ro transition does not occur between - * the time when the file is created and when - * a permanent write count is taken through - * the 'struct file' in nameidata_to_filp(). - */ - error = mnt_want_write(nd->path.mnt); - if (error) - goto exit_mutex_unlock; - want_write = 1; + audit_inode(pathname, file->f_path.dentry); + goto opened; + } + + if (*opened & FILE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; - will_truncate = 0; + will_truncate = false; acc_mode = MAY_OPEN; - error = security_path_mknod(&nd->path, dentry, mode, 0); - if (error) - goto exit_mutex_unlock; - error = vfs_create(dir->d_inode, dentry, mode, nd); - if (error) - goto exit_mutex_unlock; - mutex_unlock(&dir->d_inode->i_mutex); - dput(nd->path.dentry); - nd->path.dentry = dentry; - goto common; + path_to_nameidata(path, nd); + goto finish_open_created; } /* * It already exists. */ - mutex_unlock(&dir->d_inode->i_mutex); audit_inode(pathname, path->dentry); + /* + * If atomic_open() acquired write access it is dropped now due to + * possible mount and symlink following (this might be optimized away if + * necessary...) + */ + if (want_write) { + mnt_drop_write(nd->path.mnt); + want_write = false; + } + error = -EEXIST; if (open_flag & O_EXCL) goto exit_dput; @@ -2338,18 +2640,18 @@ finish_lookup: error = -ENOENT; if (!inode) { path_to_nameidata(path, nd); - goto exit; + goto out; } if (should_follow_link(inode, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { error = -ECHILD; - goto exit; + goto out; } } BUG_ON(inode != path->dentry->d_inode); - return NULL; + return 1; } if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { @@ -2365,119 +2667,122 @@ finish_lookup: error = complete_walk(nd); if (error) { path_put(&save_parent); - return ERR_PTR(error); + return error; } error = -EISDIR; if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) - goto exit; + goto out; error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) - goto exit; + goto out; audit_inode(pathname, nd->path.dentry); -ok: +finish_open: if (!S_ISREG(nd->inode->i_mode)) - will_truncate = 0; + will_truncate = false; if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) - goto exit; - want_write = 1; + goto out; + want_write = true; } -common: +finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) - goto exit; - filp = nameidata_to_filp(nd); - if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) { - BUG_ON(save_parent.dentry != dir); - path_put(&nd->path); - nd->path = save_parent; - nd->inode = dir->d_inode; - save_parent.mnt = NULL; - save_parent.dentry = NULL; - if (want_write) { - mnt_drop_write(nd->path.mnt); - want_write = 0; - } - retried = true; - goto retry_lookup; - } - if (!IS_ERR(filp)) { - error = ima_file_check(filp, op->acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } + goto out; + file->f_path.mnt = nd->path.mnt; + error = finish_open(file, nd->path.dentry, NULL, opened); + if (error) { + if (error == -EOPENSTALE) + goto stale_open; + goto out; } - if (!IS_ERR(filp)) { - if (will_truncate) { - error = handle_truncate(filp); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } +opened: + error = open_check_o_direct(file); + if (error) + goto exit_fput; + error = ima_file_check(file, op->acc_mode); + if (error) + goto exit_fput; + + if (will_truncate) { + error = handle_truncate(file); + if (error) + goto exit_fput; } out: if (want_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); terminate_walk(nd); - return filp; + return error; -exit_mutex_unlock: - mutex_unlock(&dir->d_inode->i_mutex); exit_dput: path_put_conditional(path, nd); -exit: - filp = ERR_PTR(error); goto out; +exit_fput: + fput(file); + goto out; + +stale_open: + /* If no saved parent or already retried then can't retry */ + if (!save_parent.dentry || retried) + goto out; + + BUG_ON(save_parent.dentry != dir); + path_put(&nd->path); + nd->path = save_parent; + nd->inode = dir->d_inode; + save_parent.mnt = NULL; + save_parent.dentry = NULL; + if (want_write) { + mnt_drop_write(nd->path.mnt); + want_write = false; + } + retried = true; + goto retry_lookup; } static struct file *path_openat(int dfd, const char *pathname, struct nameidata *nd, const struct open_flags *op, int flags) { struct file *base = NULL; - struct file *filp; + struct file *file; struct path path; + int opened = 0; int error; - filp = get_empty_filp(); - if (!filp) + file = get_empty_filp(); + if (!file) return ERR_PTR(-ENFILE); - filp->f_flags = op->open_flag; - nd->intent.open.file = filp; - nd->intent.open.flags = open_to_namei_flags(op->open_flag); - nd->intent.open.create_mode = op->mode; + file->f_flags = op->open_flag; error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) - goto out_filp; + goto out; current->total_link_count = 0; error = link_path_walk(pathname, nd); if (unlikely(error)) - goto out_filp; + goto out; - filp = do_last(nd, &path, op, pathname); - while (unlikely(!filp)) { /* trailing symlink */ + error = do_last(nd, &path, file, op, &opened, pathname); + while (unlikely(error > 0)) { /* trailing symlink */ struct path link = path; void *cookie; if (!(nd->flags & LOOKUP_FOLLOW)) { path_put_conditional(&path, nd); path_put(&nd->path); - filp = ERR_PTR(-ELOOP); + error = -ELOOP; break; } nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); error = follow_link(&link, nd, &cookie); if (unlikely(error)) - filp = ERR_PTR(error); - else - filp = do_last(nd, &path, op, pathname); + break; + error = do_last(nd, &path, file, op, &opened, pathname); put_link(nd, &link, cookie); } out: @@ -2485,18 +2790,20 @@ out: path_put(&nd->root); if (base) fput(base); - release_open_intent(nd); - if (filp == ERR_PTR(-EOPENSTALE)) { - if (flags & LOOKUP_RCU) - filp = ERR_PTR(-ECHILD); - else - filp = ERR_PTR(-ESTALE); + if (!(opened & FILE_OPENED)) { + BUG_ON(!error); + put_filp(file); } - return filp; - -out_filp: - filp = ERR_PTR(error); - goto out; + if (unlikely(error)) { + if (error == -EOPENSTALE) { + if (flags & LOOKUP_RCU) + error = -ECHILD; + else + error = -ESTALE; + } + file = ERR_PTR(error); + } + return file; } struct file *do_filp_open(int dfd, const char *pathname, @@ -2551,7 +2858,6 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path goto out; nd.flags &= ~LOOKUP_PARENT; nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; - nd.intent.open.flags = O_EXCL; /* * Do the final lookup. @@ -2670,7 +2976,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, goto out_drop_write; switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(path.dentry->d_inode,dentry,mode,NULL); + error = vfs_create(path.dentry->d_inode,dentry,mode,true); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(path.dentry->d_inode,dentry,mode, diff --git a/fs/namespace.c b/fs/namespace.c index 1e4a5fe3d7b..c53d3381b0d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -515,8 +515,20 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, } /* - * lookup_mnt increments the ref count before returning - * the vfsmount struct. + * lookup_mnt - Return the first child mount mounted at path + * + * "First" means first mounted chronologically. If you create the + * following mounts: + * + * mount /dev/sda1 /mnt + * mount /dev/sda2 /mnt + * mount /dev/sda3 /mnt + * + * Then lookup_mnt() on the base /mnt dentry in the root mount will + * return successively the root dentry and vfsmount of /dev/sda1, then + * /dev/sda2, then /dev/sda3, then NULL. + * + * lookup_mnt takes a reference to the found vfsmount. */ struct vfsmount *lookup_mnt(struct path *path) { @@ -621,21 +633,6 @@ static void attach_mnt(struct mount *mnt, struct path *path) list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts); } -static inline void __mnt_make_longterm(struct mount *mnt) -{ -#ifdef CONFIG_SMP - atomic_inc(&mnt->mnt_longterm); -#endif -} - -/* needs vfsmount lock for write */ -static inline void __mnt_make_shortterm(struct mount *mnt) -{ -#ifdef CONFIG_SMP - atomic_dec(&mnt->mnt_longterm); -#endif -} - /* * vfsmount lock must be held for write */ @@ -649,10 +646,8 @@ static void commit_tree(struct mount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) { + list_for_each_entry(m, &head, mnt_list) m->mnt_ns = n; - __mnt_make_longterm(m); - } list_splice(&head, n->list.prev); @@ -725,56 +720,60 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct super_block *sb = old->mnt.mnt_sb; - struct mount *mnt = alloc_vfsmnt(old->mnt_devname); + struct mount *mnt; + int err; - if (mnt) { - if (flag & (CL_SLAVE | CL_PRIVATE)) - mnt->mnt_group_id = 0; /* not a peer of original */ - else - mnt->mnt_group_id = old->mnt_group_id; - - if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { - int err = mnt_alloc_group_id(mnt); - if (err) - goto out_free; - } + mnt = alloc_vfsmnt(old->mnt_devname); + if (!mnt) + return ERR_PTR(-ENOMEM); - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; - atomic_inc(&sb->s_active); - mnt->mnt.mnt_sb = sb; - mnt->mnt.mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; - br_write_lock(&vfsmount_lock); - list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - br_write_unlock(&vfsmount_lock); + if (flag & (CL_SLAVE | CL_PRIVATE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; - if (flag & CL_SLAVE) { - list_add(&mnt->mnt_slave, &old->mnt_slave_list); - mnt->mnt_master = old; - CLEAR_MNT_SHARED(mnt); - } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) - list_add(&mnt->mnt_share, &old->mnt_share); - if (IS_MNT_SLAVE(old)) - list_add(&mnt->mnt_slave, &old->mnt_slave); - mnt->mnt_master = old->mnt_master; - } - if (flag & CL_MAKE_SHARED) - set_mnt_shared(mnt); - - /* stick the duplicate mount on the same expiry list - * as the original if that was on one */ - if (flag & CL_EXPIRE) { - if (!list_empty(&old->mnt_expire)) - list_add(&mnt->mnt_expire, &old->mnt_expire); - } + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; } + + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + atomic_inc(&sb->s_active); + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + br_write_lock(&vfsmount_lock); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + br_write_unlock(&vfsmount_lock); + + if (flag & CL_SLAVE) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); + } + return mnt; out_free: free_vfsmnt(mnt); - return NULL; + return ERR_PTR(err); } static inline void mntfree(struct mount *mnt) @@ -804,7 +803,8 @@ static void mntput_no_expire(struct mount *mnt) put_again: #ifdef CONFIG_SMP br_read_lock(&vfsmount_lock); - if (likely(atomic_read(&mnt->mnt_longterm))) { + if (likely(mnt->mnt_ns)) { + /* shouldn't be the last one */ mnt_add_count(mnt, -1); br_read_unlock(&vfsmount_lock); return; @@ -939,7 +939,7 @@ EXPORT_SYMBOL(replace_mount_options); /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = container_of(m, struct proc_mounts, m); + struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); return seq_list_start(&p->ns->list, *pos); @@ -947,7 +947,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = container_of(m, struct proc_mounts, m); + struct proc_mounts *p = proc_mounts(m); return seq_list_next(v, &p->ns->list, pos); } @@ -959,7 +959,7 @@ static void m_stop(struct seq_file *m, void *v) static int m_show(struct seq_file *m, void *v) { - struct proc_mounts *p = container_of(m, struct proc_mounts, m); + struct proc_mounts *p = proc_mounts(m); struct mount *r = list_entry(v, struct mount, mnt_list); return p->show(m, &r->mnt); } @@ -1074,8 +1074,6 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); - if (p->mnt_ns) - __mnt_make_shortterm(p); p->mnt_ns = NULL; list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { @@ -1260,11 +1258,12 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, struct path path; if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) - return NULL; + return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); - if (!q) - goto Enomem; + if (IS_ERR(q)) + return q; + q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; @@ -1286,8 +1285,8 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, path.mnt = &q->mnt; path.dentry = p->mnt_mountpoint; q = clone_mnt(p, p->mnt.mnt_root, flag); - if (!q) - goto Enomem; + if (IS_ERR(q)) + goto out; br_write_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, &path); @@ -1295,7 +1294,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, } } return res; -Enomem: +out: if (res) { LIST_HEAD(umount_list); br_write_lock(&vfsmount_lock); @@ -1303,9 +1302,11 @@ Enomem: br_write_unlock(&vfsmount_lock); release_mounts(&umount_list); } - return NULL; + return q; } +/* Caller should check returned pointer for errors */ + struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; @@ -1313,7 +1314,9 @@ struct vfsmount *collect_mounts(struct path *path) tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); up_write(&namespace_sem); - return tree ? &tree->mnt : NULL; + if (IS_ERR(tree)) + return NULL; + return &tree->mnt; } void drop_collected_mounts(struct vfsmount *mnt) @@ -1608,14 +1611,15 @@ static int do_loopback(struct path *path, char *old_name, if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old)) goto out2; - err = -ENOMEM; if (recurse) mnt = copy_tree(old, old_path.dentry, 0); else mnt = clone_mnt(old, old_path.dentry, 0); - if (!mnt) - goto out2; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out; + } err = graft_tree(mnt, path); if (err) { @@ -2209,23 +2213,6 @@ static struct mnt_namespace *alloc_mnt_ns(void) return new_ns; } -void mnt_make_longterm(struct vfsmount *mnt) -{ - __mnt_make_longterm(real_mount(mnt)); -} - -void mnt_make_shortterm(struct vfsmount *m) -{ -#ifdef CONFIG_SMP - struct mount *mnt = real_mount(m); - if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) - return; - br_write_lock(&vfsmount_lock); - atomic_dec(&mnt->mnt_longterm); - br_write_unlock(&vfsmount_lock); -#endif -} - /* * Allocate a new namespace structure and populate it with contents * copied from the namespace of the passed in task structure. @@ -2246,10 +2233,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, down_write(&namespace_sem); /* First pass: copy the tree topology */ new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); - if (!new) { + if (IS_ERR(new)) { up_write(&namespace_sem); kfree(new_ns); - return ERR_PTR(-ENOMEM); + return ERR_CAST(new); } new_ns->root = new; br_write_lock(&vfsmount_lock); @@ -2265,18 +2252,13 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, q = new; while (p) { q->mnt_ns = new_ns; - __mnt_make_longterm(q); if (fs) { if (&p->mnt == fs->root.mnt) { fs->root.mnt = mntget(&q->mnt); - __mnt_make_longterm(q); - mnt_make_shortterm(&p->mnt); rootmnt = &p->mnt; } if (&p->mnt == fs->pwd.mnt) { fs->pwd.mnt = mntget(&q->mnt); - __mnt_make_longterm(q); - mnt_make_shortterm(&p->mnt); pwdmnt = &p->mnt; } } @@ -2320,7 +2302,6 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) if (!IS_ERR(new_ns)) { struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; - __mnt_make_longterm(mnt); new_ns->root = mnt; list_add(&new_ns->list, &mnt->mnt_list); } else { @@ -2615,7 +2596,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) * it is a longterm mount, don't release mnt until * we unmount before file sys is unregistered */ - mnt_make_longterm(mnt); + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; } return mnt; } @@ -2625,7 +2606,9 @@ void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR_OR_NULL(mnt)) { - mnt_make_shortterm(mnt); + br_write_lock(&vfsmount_lock); + real_mount(mnt)->mnt_ns = NULL; + br_write_unlock(&vfsmount_lock); mntput(mnt); } } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index aeed93a6bde..4117e7b377b 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -30,8 +30,8 @@ static void ncp_do_readdir(struct file *, void *, filldir_t, static int ncp_readdir(struct file *, void *, filldir_t); -static int ncp_create(struct inode *, struct dentry *, umode_t, struct nameidata *); -static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); +static int ncp_create(struct inode *, struct dentry *, umode_t, bool); +static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); static int ncp_unlink(struct inode *, struct dentry *); static int ncp_mkdir(struct inode *, struct dentry *, umode_t); static int ncp_rmdir(struct inode *, struct dentry *); @@ -72,7 +72,7 @@ const struct inode_operations ncp_dir_inode_operations = /* * Dentry operations routines */ -static int ncp_lookup_validate(struct dentry *, struct nameidata *); +static int ncp_lookup_validate(struct dentry *, unsigned int); static int ncp_hash_dentry(const struct dentry *, const struct inode *, struct qstr *); static int ncp_compare_dentry(const struct dentry *, const struct inode *, @@ -290,7 +290,7 @@ leave_me:; static int -ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) +ncp_lookup_validate(struct dentry *dentry, unsigned int flags) { struct ncp_server *server; struct dentry *parent; @@ -302,7 +302,7 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd) if (dentry == dentry->d_sb->s_root) return 1; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; parent = dget_parent(dentry); @@ -836,7 +836,7 @@ out: return result; } -static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ncp_server *server = NCP_SERVER(dir); struct inode *inode = NULL; @@ -980,7 +980,7 @@ out: } static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { return ncp_create_new(dir, dentry, mode, 0, 0); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f430057ff3b..a6b1c7fb823 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -46,8 +46,8 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); -static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); +static struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); +static int nfs_create(struct inode *, struct dentry *, umode_t, bool); static int nfs_mkdir(struct inode *, struct dentry *, umode_t); static int nfs_rmdir(struct inode *, struct dentry *); static int nfs_unlink(struct inode *, struct dentry *); @@ -111,11 +111,13 @@ const struct inode_operations nfs3_dir_inode_operations = { #ifdef CONFIG_NFS_V4 -static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_open_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd); +static int nfs_atomic_open(struct inode *, struct dentry *, + struct file *, unsigned, umode_t, + int *); const struct inode_operations nfs4_dir_inode_operations = { - .create = nfs_open_create, - .lookup = nfs_atomic_lookup, + .create = nfs_create, + .lookup = nfs_lookup, + .atomic_open = nfs_atomic_open, .link = nfs_link, .unlink = nfs_unlink, .symlink = nfs_symlink, @@ -1029,27 +1031,14 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) } /* - * Return the intent data that applies to this particular path component - * - * Note that the current set of intents only apply to the very last - * component of the path and none of them is set before that last - * component. - */ -static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, - unsigned int mask) -{ - return nd->flags & mask; -} - -/* * Use intent information to check whether or not we're going to do * an O_EXCL create using this path component. */ -static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) +static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; - return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL); + return flags & LOOKUP_EXCL; } /* @@ -1061,25 +1050,20 @@ static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) * */ static inline -int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) +int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) { struct nfs_server *server = NFS_SERVER(inode); if (IS_AUTOMOUNT(inode)) return 0; - if (nd != NULL) { - /* VFS wants an on-the-wire revalidation */ - if (nd->flags & LOOKUP_REVAL) - goto out_force; - /* This is an open(2) */ - if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && - !(server->flags & NFS_MOUNT_NOCTO) && - (S_ISREG(inode->i_mode) || - S_ISDIR(inode->i_mode))) - goto out_force; - return 0; - } - return nfs_revalidate_inode(server, inode); + /* VFS wants an on-the-wire revalidation */ + if (flags & LOOKUP_REVAL) + goto out_force; + /* This is an open(2) */ + if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + goto out_force; + return 0; out_force: return __nfs_revalidate_inode(server, inode); } @@ -1093,10 +1077,10 @@ out_force: */ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { /* Don't revalidate a negative dentry if we're creating a new file */ - if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) + if (flags & LOOKUP_CREATE) return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; @@ -1114,7 +1098,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ -static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *dir; struct inode *inode; @@ -1123,7 +1107,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) struct nfs_fattr *fattr = NULL; int error; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; parent = dget_parent(dentry); @@ -1132,7 +1116,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) inode = dentry->d_inode; if (!inode) { - if (nfs_neg_need_reval(dir, dentry, nd)) + if (nfs_neg_need_reval(dir, dentry, flags)) goto out_bad; goto out_valid_noent; } @@ -1148,8 +1132,8 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) goto out_set_verifier; /* Force a full look up iff the parent directory has changed */ - if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { - if (nfs_lookup_verify_inode(inode, nd)) + if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, flags)) goto out_zap_parent; goto out_valid; } @@ -1286,7 +1270,7 @@ const struct dentry_operations nfs_dentry_operations = { .d_release = nfs_d_release, }; -static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; struct dentry *parent; @@ -1307,7 +1291,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ - if (nfs_is_exclusive_create(dir, nd)) { + if (nfs_is_exclusive_create(dir, flags)) { d_instantiate(dentry, NULL); res = NULL; goto out; @@ -1354,7 +1338,7 @@ out: } #ifdef CONFIG_NFS_V4 -static int nfs4_lookup_revalidate(struct dentry *, struct nameidata *); +static int nfs4_lookup_revalidate(struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, @@ -1364,24 +1348,6 @@ const struct dentry_operations nfs4_dentry_operations = { .d_release = nfs_d_release, }; -/* - * Use intent information to determine whether we need to substitute - * the NFSv4-style stateful OPEN for the LOOKUP call - */ -static int is_atomic_open(struct nameidata *nd) -{ - if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) - return 0; - /* NFS does not (yet) have a stateful open for directories */ - if (nd->flags & LOOKUP_DIRECTORY) - return 0; - /* Are we trying to write to a read only partition? */ - if (__mnt_is_readonly(nd->path.mnt) && - (nd->intent.open.flags & (O_CREAT|O_TRUNC|O_ACCMODE))) - return 0; - return 1; -} - static fmode_t flags_to_mode(int flags) { fmode_t res = (__force fmode_t)flags & FMODE_EXEC; @@ -1403,136 +1369,143 @@ static int do_open(struct inode *inode, struct file *filp) return 0; } -static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx) +static int nfs_finish_open(struct nfs_open_context *ctx, + struct dentry *dentry, + struct file *file, unsigned open_flags, + int *opened) { - struct file *filp; - int ret = 0; + int err; + + if (ctx->dentry != dentry) { + dput(ctx->dentry); + ctx->dentry = dget(dentry); + } /* If the open_intent is for execute, we have an extra check to make */ if (ctx->mode & FMODE_EXEC) { - ret = nfs_may_open(ctx->dentry->d_inode, - ctx->cred, - nd->intent.open.flags); - if (ret < 0) + err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags); + if (err < 0) goto out; } - filp = lookup_instantiate_filp(nd, ctx->dentry, do_open); - if (IS_ERR(filp)) - ret = PTR_ERR(filp); - else - nfs_file_set_open_context(filp, ctx); + + err = finish_open(file, dentry, do_open, opened); + if (err) + goto out; + nfs_file_set_open_context(file, ctx); + out: put_nfs_open_context(ctx); - return ret; + return err; } -static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) { struct nfs_open_context *ctx; - struct iattr attr; - struct dentry *res = NULL; + struct dentry *res; + struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; - int open_flags; int err; - dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", + /* Expect a negative dentry */ + BUG_ON(dentry->d_inode); + + dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - /* Check that we are indeed trying to open this file */ - if (!is_atomic_open(nd)) + /* NFS only supports OPEN on regular files */ + if ((open_flags & O_DIRECTORY)) { + if (!d_unhashed(dentry)) { + /* + * Hashed negative dentry with O_DIRECTORY: dentry was + * revalidated and is fine, no need to perform lookup + * again + */ + return -ENOENT; + } goto no_open; - - if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { - res = ERR_PTR(-ENAMETOOLONG); - goto out; - } - - /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash - * the dentry. */ - if (nd->flags & LOOKUP_EXCL) { - d_instantiate(dentry, NULL); - goto out; } - open_flags = nd->intent.open.flags; - attr.ia_valid = ATTR_OPEN; - - ctx = create_nfs_open_context(dentry, open_flags); - res = ERR_CAST(ctx); - if (IS_ERR(ctx)) - goto out; + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; - if (nd->flags & LOOKUP_CREATE) { - attr.ia_mode = nd->intent.open.create_mode; + if (open_flags & O_CREAT) { attr.ia_valid |= ATTR_MODE; - attr.ia_mode &= ~current_umask(); - } else - open_flags &= ~(O_EXCL | O_CREAT); - + attr.ia_mode = mode & ~current_umask(); + } if (open_flags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; } - /* Open the file on the server */ + ctx = create_nfs_open_context(dentry, open_flags); + err = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + nfs_block_sillyrename(dentry->d_parent); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); + d_drop(dentry); if (IS_ERR(inode)) { nfs_unblock_sillyrename(dentry->d_parent); put_nfs_open_context(ctx); - switch (PTR_ERR(inode)) { - /* Make a negative dentry */ - case -ENOENT: - d_add(dentry, NULL); - res = NULL; - goto out; - /* This turned out not to be a regular file */ - case -EISDIR: - case -ENOTDIR: + err = PTR_ERR(inode); + switch (err) { + case -ENOENT: + d_add(dentry, NULL); + break; + case -EISDIR: + case -ENOTDIR: + goto no_open; + case -ELOOP: + if (!(open_flags & O_NOFOLLOW)) goto no_open; - case -ELOOP: - if (!(nd->intent.open.flags & O_NOFOLLOW)) - goto no_open; + break; /* case -EINVAL: */ - default: - res = ERR_CAST(inode); - goto out; + default: + break; } + goto out; } res = d_add_unique(dentry, inode); - nfs_unblock_sillyrename(dentry->d_parent); - if (res != NULL) { - dput(ctx->dentry); - ctx->dentry = dget(res); + if (res != NULL) dentry = res; - } - err = nfs_intent_set_file(nd, ctx); - if (err < 0) { - if (res != NULL) - dput(res); - return ERR_PTR(err); - } -out: + + nfs_unblock_sillyrename(dentry->d_parent); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - return res; + + err = nfs_finish_open(ctx, dentry, file, open_flags, opened); + + dput(res); +out: + return err; + no_open: - return nfs_lookup(dir, dentry, nd); + res = nfs_lookup(dir, dentry, 0); + err = PTR_ERR(res); + if (IS_ERR(res)) + goto out; + + return finish_no_open(file, res); } -static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *parent = NULL; struct inode *inode; struct inode *dir; - int openflags, ret = 0; + int ret = 0; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; - if (!is_atomic_open(nd) || d_mountpoint(dentry)) + if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) + goto no_open; + if (d_mountpoint(dentry)) goto no_open; + inode = dentry->d_inode; parent = dget_parent(dentry); dir = parent->d_inode; @@ -1540,7 +1513,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) * optimize away revalidation of negative dentries. */ if (inode == NULL) { - if (!nfs_neg_need_reval(dir, dentry, nd)) + if (!nfs_neg_need_reval(dir, dentry, flags)) ret = 1; goto out; } @@ -1548,9 +1521,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) goto no_open_dput; - openflags = nd->intent.open.flags; /* We cannot do exclusive creation on a positive dentry */ - if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) + if (flags & LOOKUP_EXCL) goto no_open_dput; /* Let f_op->open() actually open (and revalidate) the file */ @@ -1563,48 +1535,7 @@ out: no_open_dput: dput(parent); no_open: - return nfs_lookup_revalidate(dentry, nd); -} - -static int nfs_open_create(struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) -{ - struct nfs_open_context *ctx = NULL; - struct iattr attr; - int error; - int open_flags = O_CREAT|O_EXCL; - - dfprintk(VFS, "NFS: create(%s/%ld), %s\n", - dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - - attr.ia_mode = mode; - attr.ia_valid = ATTR_MODE; - - if (nd) - open_flags = nd->intent.open.flags; - - ctx = create_nfs_open_context(dentry, open_flags); - error = PTR_ERR(ctx); - if (IS_ERR(ctx)) - goto out_err_drop; - - error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); - if (error != 0) - goto out_put_ctx; - if (nd) { - error = nfs_intent_set_file(nd, ctx); - if (error < 0) - goto out_err; - } else { - put_nfs_open_context(ctx); - } - return 0; -out_put_ctx: - put_nfs_open_context(ctx); -out_err_drop: - d_drop(dentry); -out_err: - return error; + return nfs_lookup_revalidate(dentry, flags); } #endif /* CONFIG_NFSV4 */ @@ -1658,11 +1589,11 @@ out_error: * reply path made it appear to have failed. */ static int nfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) + umode_t mode, bool excl) { struct iattr attr; + int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; - int open_flags = O_CREAT|O_EXCL; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1670,10 +1601,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - if (nd) - open_flags = nd->intent.open.flags; - - error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); if (error != 0) goto out_err; return 0; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 8abfb19bd3a..a67990f90bd 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -62,7 +62,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i */ spin_lock(&sb->s_root->d_inode->i_lock); spin_lock(&sb->s_root->d_lock); - list_del_init(&sb->s_root->d_alias); + hlist_del_init(&sb->s_root->d_alias); spin_unlock(&sb->s_root->d_lock); spin_unlock(&sb->s_root->d_inode->i_lock); } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2292a0fd2bf..3187e24e8f7 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -314,7 +314,7 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nfs_open_context *ctx) + int flags) { struct nfs3_createdata *data; umode_t mode = sattr->ia_mode; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 15fc7e4664e..c157b2089b4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2806,37 +2806,22 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page, } /* - * Got race? - * We will need to arrange for the VFS layer to provide an atomic open. - * Until then, this create/open method is prone to inefficiency and race - * conditions due to the lookup, create, and open VFS calls from sys_open() - * placed on the wire. - * - * Given the above sorry state of affairs, I'm simply sending an OPEN. - * The file will be opened again in the subsequent VFS open call - * (nfs4_proc_file_open). - * - * The open for read will just hang around to be used by any process that - * opens the file O_RDONLY. This will all be resolved with the VFS changes. + * This is just for mknod. open(O_CREAT) will always do ->open_context(). */ - static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nfs_open_context *ctx) + int flags) { - struct dentry *de = dentry; + struct nfs_open_context *ctx; struct nfs4_state *state; - struct rpc_cred *cred = NULL; - fmode_t fmode = 0; int status = 0; - if (ctx != NULL) { - cred = ctx->cred; - de = ctx->dentry; - fmode = ctx->mode; - } + ctx = alloc_nfs_open_context(dentry, FMODE_READ); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, de, fmode, flags, sattr, cred, NULL); + state = nfs4_do_open(dir, dentry, ctx->mode, flags, sattr, ctx->cred, NULL); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -2844,11 +2829,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, } d_add(dentry, igrab(state->inode)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - if (ctx != NULL) - ctx->state = state; - else - nfs4_close_sync(state, fmode); + ctx->state = state; out: + put_nfs_open_context(ctx); return status; } diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index b47277baeba..f50d3e8d6f2 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -454,7 +454,10 @@ int objio_read_pagelist(struct nfs_read_data *rdata) objios->ios->done = _read_done; dprintk("%s: offset=0x%llx length=0x%x\n", __func__, rdata->args.offset, rdata->args.count); - return ore_read(objios->ios); + ret = ore_read(objios->ios); + if (unlikely(ret)) + objio_free_result(&objios->oir); + return ret; } /* @@ -486,8 +489,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) struct nfs_write_data *wdata = objios->oir.rpcdata; struct address_space *mapping = wdata->header->inode->i_mapping; pgoff_t index = offset / PAGE_SIZE; - struct page *page = find_get_page(mapping, index); + struct page *page; + loff_t i_size = i_size_read(wdata->header->inode); + + if (offset >= i_size) { + *uptodate = true; + dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); + return ZERO_PAGE(0); + } + page = find_get_page(mapping, index); if (!page) { page = find_or_create_page(mapping, index, GFP_NOFS); if (unlikely(!page)) { @@ -507,8 +518,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) static void __r4w_put_page(void *priv, struct page *page) { - dprintk("%s: index=0x%lx\n", __func__, page->index); - page_cache_release(page); + dprintk("%s: index=0x%lx\n", __func__, + (page == ZERO_PAGE(0)) ? -1UL : page->index); + if (ZERO_PAGE(0) != page) + page_cache_release(page); return; } @@ -539,8 +552,10 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) dprintk("%s: offset=0x%llx length=0x%x\n", __func__, wdata->args.offset, wdata->args.count); ret = ore_write(objios->ios); - if (unlikely(ret)) + if (unlikely(ret)) { + objio_free_result(&objios->oir); return ret; + } if (objios->sync) _write_done(objios->ios, objios); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 617c7419a08..4433806e116 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -259,7 +259,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data) static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nfs_open_context *ctx) + int flags) { struct nfs_createdata *data; struct rpc_message msg = { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 06228192f64..8b2a2977b72 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2419,7 +2419,7 @@ static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type, sb_mntdata.mntflags |= MS_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); + s = sget(fs_type, compare_super, nfs_set_super, flags, &sb_mntdata); if (IS_ERR(s)) { mntroot = ERR_CAST(s); goto out_err_nosb; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c8bd9c3be7f..4700a0a929d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -745,7 +745,7 @@ __be32 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { - struct dentry *dentry; + struct path path; struct inode *inode; int flags = O_RDONLY|O_LARGEFILE; __be32 err; @@ -762,8 +762,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, if (err) goto out; - dentry = fhp->fh_dentry; - inode = dentry->d_inode; + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = fhp->fh_dentry; + inode = path.dentry->d_inode; /* Disallow write access to files with the append-only bit set * or any access when mandatory locking enabled @@ -792,8 +793,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, else flags = O_WRONLY|O_LARGEFILE; } - *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), - flags, current_cred()); + *filp = dentry_open(&path, flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); else { @@ -1329,7 +1329,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + host_err = vfs_create(dirp, dchild, iap->ia_mode, true); if (!host_err) nfsd_check_ignore_resizing(iap); break; @@ -1492,7 +1492,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + host_err = vfs_create(dirp, dchild, iap->ia_mode, true); if (host_err < 0) { fh_drop_write(fhp); goto out_nfserr; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index b72847988b7..1d0c0b84c5a 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -63,7 +63,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) */ static struct dentry * -nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; ino_t ino; @@ -85,7 +85,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) * with d_instantiate(). */ static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode; struct nilfs_transaction_info ti; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1099a76cee5..d57c42f974e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1288,7 +1288,8 @@ nilfs_mount(struct file_system_type *fs_type, int flags, err = -EBUSY; goto failed; } - s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev); + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, + sd.bdev); mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) { err = PTR_ERR(s); @@ -1301,7 +1302,6 @@ nilfs_mount(struct file_system_type *fs_type, int flags, s_new = true; /* New superblock instance created */ - s->s_flags = flags; s->s_mode = mode; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3568c8a8b13..d4380366973 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -61,8 +61,6 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) { int client_fd; - struct dentry *dentry; - struct vfsmount *mnt; struct file *new_file; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -81,12 +79,10 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. */ - dentry = dget(event->path.dentry); - mnt = mntget(event->path.mnt); /* it's possible this event was an overflow event. in that case dentry and mnt * are NULL; That's fine, just don't call dentry open */ - if (dentry && mnt) - new_file = dentry_open(dentry, mnt, + if (event->path.dentry && event->path.mnt) + new_file = dentry_open(&event->path, group->fanotify_data.f_flags | FMODE_NONOTIFY, current_cred()); else diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index b39c5c161ad..6baadb5a843 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -52,6 +52,7 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) void __fsnotify_update_child_dentry_flags(struct inode *inode) { struct dentry *alias; + struct hlist_node *p; int watched; if (!S_ISDIR(inode->i_mode)) @@ -63,7 +64,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ - list_for_each_entry(alias, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 358273e59ad..436f36037e0 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -101,7 +101,7 @@ * Locking: Caller must hold i_mutex on the directory. */ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, - struct nameidata *nd) + unsigned int flags) { ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); struct inode *dent_inode; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index e5ba3481833..8db4b58b2e4 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -49,14 +49,13 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) } -static int ocfs2_dentry_revalidate(struct dentry *dentry, - struct nameidata *nd) +static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; int ret = 0; /* if all else fails, just return false */ struct ocfs2_super *osb; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; @@ -170,13 +169,11 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed) { - struct list_head *p; - struct dentry *dentry = NULL; + struct hlist_node *p; + struct dentry *dentry; spin_lock(&inode->i_lock); - list_for_each(p, &inode->i_dentry) { - dentry = list_entry(p, struct dentry, d_alias); - + hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { trace_ocfs2_find_local_alias(dentry->d_name.len, @@ -184,16 +181,13 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, dget_dlock(dentry); spin_unlock(&dentry->d_lock); - break; + spin_unlock(&inode->i_lock); + return dentry; } spin_unlock(&dentry->d_lock); - - dentry = NULL; } - spin_unlock(&inode->i_lock); - - return dentry; + return NULL; } DEFINE_SPINLOCK(dentry_attach_lock); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index e31d6ae013a..83b6f98e066 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -526,7 +526,7 @@ bail: static int dlmfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { int status = 0; struct inode *inode; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 9f39c640cdd..f1fd0741162 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -98,7 +98,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { int status; u64 blkno; @@ -618,7 +618,7 @@ static int ocfs2_mkdir(struct inode *dir, static int ocfs2_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { int ret; diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index f00576ec320..fb5b3ff79dc 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -285,13 +285,13 @@ static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { return omfs_add_node(dir, dentry, mode | S_IFREG); } static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct buffer_head *bh; struct inode *inode = NULL; diff --git a/fs/open.c b/fs/open.c index 1540632d838..1e914b397e1 100644 --- a/fs/open.c +++ b/fs/open.c @@ -537,25 +537,6 @@ static int chown_common(struct path *path, uid_t user, gid_t group) return error; } -SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) -{ - struct path path; - int error; - - error = user_path(filename, &path); - if (error) - goto out; - error = mnt_want_write(path.mnt); - if (error) - goto out_release; - error = chown_common(&path, user, group); - mnt_drop_write(path.mnt); -out_release: - path_put(&path); -out: - return error; -} - SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { @@ -583,23 +564,15 @@ out: return error; } -SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) +SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { - struct path path; - int error; + return sys_fchownat(AT_FDCWD, filename, user, group, 0); +} - error = user_lpath(filename, &path); - if (error) - goto out; - error = mnt_want_write(path.mnt); - if (error) - goto out_release; - error = chown_common(&path, user, group); - mnt_drop_write(path.mnt); -out_release: - path_put(&path); -out: - return error; +SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) +{ + return sys_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) @@ -667,10 +640,9 @@ int open_check_o_direct(struct file *f) return 0; } -static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt, - struct file *f, - int (*open)(struct inode *, struct file *), - const struct cred *cred) +static int do_dentry_open(struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) { static const struct file_operations empty_fops = {}; struct inode *inode; @@ -682,9 +654,9 @@ static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (unlikely(f->f_flags & O_PATH)) f->f_mode = FMODE_PATH; - inode = dentry->d_inode; + inode = f->f_path.dentry->d_inode; if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, mnt); + error = __get_file_write_access(inode, f->f_path.mnt); if (error) goto cleanup_file; if (!special_file(inode->i_mode)) @@ -692,14 +664,12 @@ static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt, } f->f_mapping = inode->i_mapping; - f->f_path.dentry = dentry; - f->f_path.mnt = mnt; f->f_pos = 0; file_sb_list_add(f, inode->i_sb); if (unlikely(f->f_mode & FMODE_PATH)) { f->f_op = &empty_fops; - return f; + return 0; } f->f_op = fops_get(inode->i_fop); @@ -726,10 +696,11 @@ static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt, file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - return f; + return 0; cleanup_all: fops_put(f->f_op); + file_sb_list_del(f); if (f->f_mode & FMODE_WRITE) { put_write_access(inode); if (!special_file(inode->i_mode)) { @@ -740,124 +711,62 @@ cleanup_all: * here, so just reset the state. */ file_reset_write(f); - mnt_drop_write(mnt); + mnt_drop_write(f->f_path.mnt); } } - file_sb_list_del(f); - f->f_path.dentry = NULL; - f->f_path.mnt = NULL; cleanup_file: - dput(dentry); - mntput(mnt); - return ERR_PTR(error); -} - -static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - struct file *f, - int (*open)(struct inode *, struct file *), - const struct cred *cred) -{ - struct file *res = do_dentry_open(dentry, mnt, f, open, cred); - if (!IS_ERR(res)) { - int error = open_check_o_direct(f); - if (error) { - fput(res); - res = ERR_PTR(error); - } - } else { - put_filp(f); - } - return res; + path_put(&f->f_path); + f->f_path.mnt = NULL; + f->f_path.dentry = NULL; + return error; } /** - * lookup_instantiate_filp - instantiates the open intent filp - * @nd: pointer to nameidata + * finish_open - finish opening a file + * @od: opaque open data * @dentry: pointer to dentry * @open: open callback * - * Helper for filesystems that want to use lookup open intents and pass back - * a fully instantiated struct file to the caller. - * This function is meant to be called from within a filesystem's - * lookup method. - * Beware of calling it for non-regular files! Those ->open methods might block - * (e.g. in fifo_open), leaving you with parent locked (and in case of fifo, - * leading to a deadlock, as nobody can open that fifo anymore, because - * another process to open fifo will block on locked parent when doing lookup). - * Note that in case of error, nd->intent.open.file is destroyed, but the - * path information remains valid. + * This can be used to finish opening a file passed to i_op->atomic_open(). + * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. */ -struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, - int (*open)(struct inode *, struct file *)) +int finish_open(struct file *file, struct dentry *dentry, + int (*open)(struct inode *, struct file *), + int *opened) { - const struct cred *cred = current_cred(); + int error; + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ - if (IS_ERR(nd->intent.open.file)) - goto out; - if (IS_ERR(dentry)) - goto out_err; - nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.file, - open, cred); -out: - return nd->intent.open.file; -out_err: - release_open_intent(nd); - nd->intent.open.file = ERR_CAST(dentry); - goto out; + mntget(file->f_path.mnt); + file->f_path.dentry = dget(dentry); + + error = do_dentry_open(file, open, current_cred()); + if (!error) + *opened |= FILE_OPENED; + + return error; } -EXPORT_SYMBOL_GPL(lookup_instantiate_filp); +EXPORT_SYMBOL(finish_open); /** - * nameidata_to_filp - convert a nameidata to an open filp. - * @nd: pointer to nameidata - * @flags: open flags + * finish_no_open - finish ->atomic_open() without opening the file + * + * @od: opaque open data + * @dentry: dentry or NULL (as returned from ->lookup()) * - * Note that this function destroys the original nameidata + * This can be used to set the result of a successful lookup in ->atomic_open(). + * The filesystem's atomic_open() method shall return NULL after calling this. */ -struct file *nameidata_to_filp(struct nameidata *nd) +int finish_no_open(struct file *file, struct dentry *dentry) { - const struct cred *cred = current_cred(); - struct file *filp; - - /* Pick up the filp from the open intent */ - filp = nd->intent.open.file; - - /* Has the filesystem initialised the file for us? */ - if (filp->f_path.dentry != NULL) { - nd->intent.open.file = NULL; - } else { - struct file *res; - - path_get(&nd->path); - res = do_dentry_open(nd->path.dentry, nd->path.mnt, - filp, NULL, cred); - if (!IS_ERR(res)) { - int error; - - nd->intent.open.file = NULL; - BUG_ON(res != filp); - - error = open_check_o_direct(filp); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } else { - /* Allow nd->intent.open.file to be recycled */ - filp = res; - } - } - return filp; + file->f_path.dentry = dentry; + return 1; } +EXPORT_SYMBOL(finish_no_open); -/* - * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an - * error. - */ -struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, +struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; @@ -866,18 +775,28 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, validate_creds(cred); /* We must always pass in a valid mount pointer. */ - BUG_ON(!mnt); + BUG_ON(!path->mnt); error = -ENFILE; f = get_empty_filp(); - if (f == NULL) { - dput(dentry); - mntput(mnt); + if (f == NULL) return ERR_PTR(error); - } f->f_flags = flags; - return __dentry_open(dentry, mnt, f, NULL, cred); + f->f_path = *path; + path_get(&f->f_path); + error = do_dentry_open(f, NULL, cred); + if (!error) { + error = open_check_o_direct(f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } else { + put_filp(f); + f = ERR_PTR(error); + } + return f; } EXPORT_SYMBOL(dentry_open); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index bc49c975d50..4a3477949bc 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -170,13 +170,13 @@ static const struct file_operations openprom_operations = { .llseek = generic_file_llseek, }; -static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, unsigned int); static const struct inode_operations openprom_inode_operations = { .lookup = openpromfs_lookup, }; -static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct op_inode_info *ent_oi, *oi = OP_I(dir); struct device_node *dp, *child; diff --git a/fs/pnode.c b/fs/pnode.c index bed378db075..3e000a51ac0 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -237,8 +237,9 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - if (!(child = copy_tree(source, source->mnt.mnt_root, type))) { - ret = -ENOMEM; + child = copy_tree(source, source->mnt.mnt_root, type); + if (IS_ERR(child)) { + ret = PTR_ERR(child); list_splice(tree_list, tmp_list.prev); goto out; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 437195f204e..2772208338f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1427,16 +1427,19 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; + struct path path; int error = -EACCES; - /* We don't need a base pointer in the /proc filesystem */ - path_put(&nd->path); - /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + nd_jump_link(nd, &path); + return NULL; out: return ERR_PTR(error); } @@ -1601,13 +1604,13 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * made this apply to all per process world readable and executable * directories. */ -int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; const struct cred *cred; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; @@ -1781,7 +1784,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) return proc_fd_info(dentry->d_inode, path, NULL); } -static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -1789,7 +1792,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) struct files_struct *files; const struct cred *cred; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; @@ -1868,7 +1871,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, NULL)) + if (tid_fd_revalidate(dentry, 0)) error = NULL; out: @@ -1956,7 +1959,7 @@ out_no_task: } static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); } @@ -2003,7 +2006,7 @@ static int dname_to_vma_addr(struct dentry *dentry, return 0; } -static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; @@ -2013,7 +2016,7 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *inode; int status = 0; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; if (!capable(CAP_SYS_ADMIN)) { @@ -2145,7 +2148,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, } static struct dentry *proc_map_files_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; @@ -2371,7 +2374,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, NULL)) + if (tid_fd_revalidate(dentry, 0)) error = NULL; out: @@ -2380,7 +2383,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, static struct dentry *proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } @@ -2430,7 +2433,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; @@ -2630,7 +2633,7 @@ static const struct file_operations proc_attr_dir_operations = { }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); @@ -3114,7 +3117,8 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = default_llseek, }; -static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } @@ -3237,13 +3241,13 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *result; struct task_struct *task; @@ -3470,7 +3474,8 @@ static int proc_tid_base_readdir(struct file * filp, tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); } -static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } @@ -3508,13 +3513,13 @@ static struct dentry *proc_task_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; } -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2edf34f2eb6..b3647fe6a60 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -446,7 +446,7 @@ out_unlock: } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookup_de(PDE(dir), dir, dentry); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index eca4aca5b6e..e1167a1c912 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -106,7 +106,7 @@ void pde_users_dec(struct proc_dir_entry *pde); extern spinlock_t proc_subdir_lock; -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); unsigned long task_vsize(struct mm_struct *); unsigned long task_statm(struct mm_struct *, @@ -132,7 +132,7 @@ int proc_remount(struct super_block *sb, int *flags, char *data); * of the /proc/<pid> subdirectories. */ int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); +struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); @@ -142,7 +142,7 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *, int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr); -int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +int pid_revalidate(struct dentry *dentry, unsigned int flags); struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); extern const struct dentry_operations pid_dentry_operations; int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 0d9e23a39e4..b178ed733c3 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -56,7 +56,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; @@ -140,7 +140,7 @@ const struct file_operations proc_ns_dir_operations = { }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct dentry *error; struct task_struct *task = get_proc_task(dir); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 927cbd115e5..df7dd08d439 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -101,6 +101,11 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde, { struct proc_dir_entry *ent; + if (!oldprop) { + proc_device_tree_add_prop(pde, newprop); + return; + } + for (ent = pde->subdir; ent != NULL; ent = ent->next) if (ent->data == oldprop) break; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 06e1cc17caf..fe72cd073de 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -119,7 +119,7 @@ static struct net *get_proc_task_net(struct inode *dir) } static struct dentry *proc_tgid_net_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 3476bca8f7a..dfafeb2b05a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -433,7 +433,7 @@ static struct ctl_table_header *grab_header(struct inode *inode) } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; @@ -794,9 +794,9 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) +static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; return !PROC_I(dentry->d_inode)->sysctl->unregistering; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 7c30fce037c..9a2d9fd7cad 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -111,7 +111,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, options = data; } - sb = sget(fs_type, proc_test_super, proc_set_super, ns); + sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); if (IS_ERR(sb)) return ERR_CAST(sb); @@ -121,7 +121,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } if (!sb->s_root) { - sb->s_flags = flags; err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); @@ -200,13 +199,12 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct return 0; } -static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, nd)) { + if (!proc_lookup(dir, dentry, flags)) return NULL; - } - return proc_pid_lookup(dir, dentry, nd); + return proc_pid_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file * filp, diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 5e289a7cbad..5fe34c355e8 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -17,7 +17,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { - struct proc_mounts *p = file->private_data; + struct proc_mounts *p = proc_mounts(file->private_data); struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; @@ -121,7 +121,7 @@ out: static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = m->private; + struct proc_mounts *p = proc_mounts(m); struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -268,7 +268,6 @@ static int mounts_open_common(struct inode *inode, struct file *file, if (ret) goto err_free; - p->m.private = p; p->ns = ns; p->root = root; p->m.poll_event = ns->event; @@ -288,7 +287,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, static int mounts_release(struct inode *inode, struct file *file) { - struct proc_mounts *p = file->private_data; + struct proc_mounts *p = proc_mounts(file->private_data); path_put(&p->root); put_mnt_ns(p->ns); return seq_release(inode, file); diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 23ade2680a4..d39bb5cce88 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -12,6 +12,25 @@ config PSTORE If you don't have a platform persistent store driver, say N. +config PSTORE_CONSOLE + bool "Log kernel console messages" + depends on PSTORE + help + When the option is enabled, pstore will log all kernel + messages, even if no oops or panic happened. + +config PSTORE_FTRACE + bool "Persistent function tracer" + depends on PSTORE + depends on FUNCTION_TRACER + help + With this option kernel traces function calls into a persistent + ram buffer that can be decoded and dumped after reboot through + pstore filesystem. It can be used to determine what function + was last called before a reset or panic. + + If unsure, say N. + config PSTORE_RAM tristate "Log panic/oops to a RAM buffer" depends on PSTORE diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 278a44e0d4e..4c9095c2781 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -5,6 +5,7 @@ obj-y += pstore.o pstore-objs += inode.o platform.o +obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c new file mode 100644 index 00000000000..a130d484b7d --- /dev/null +++ b/fs/pstore/ftrace.c @@ -0,0 +1,35 @@ +/* + * Copyright 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/compiler.h> +#include <linux/irqflags.h> +#include <linux/percpu.h> +#include <linux/smp.h> +#include <linux/atomic.h> +#include <asm/barrier.h> +#include "internal.h" + +void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip) +{ + struct pstore_ftrace_record rec = {}; + + if (unlikely(oops_in_progress)) + return; + + rec.ip = ip; + rec.parent_ip = parent_ip; + pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); + psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, + sizeof(rec), psinfo); +} diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 11a2aa2a56c..4ab572e6d27 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -27,6 +27,7 @@ #include <linux/list.h> #include <linux/string.h> #include <linux/mount.h> +#include <linux/seq_file.h> #include <linux/ramfs.h> #include <linux/parser.h> #include <linux/sched.h> @@ -52,18 +53,117 @@ struct pstore_private { char data[]; }; +struct pstore_ftrace_seq_data { + const void *ptr; + size_t off; + size_t size; +}; + +#define REC_SIZE sizeof(struct pstore_ftrace_record) + +static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) +{ + struct pstore_private *ps = s->private; + struct pstore_ftrace_seq_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + + data->off = ps->size % REC_SIZE; + data->off += *pos * REC_SIZE; + if (data->off + REC_SIZE > ps->size) { + kfree(data); + return NULL; + } + + return data; + +} + +static void pstore_ftrace_seq_stop(struct seq_file *s, void *v) +{ + kfree(v); +} + +static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pstore_private *ps = s->private; + struct pstore_ftrace_seq_data *data = v; + + data->off += REC_SIZE; + if (data->off + REC_SIZE > ps->size) + return NULL; + + (*pos)++; + return data; +} + +static int pstore_ftrace_seq_show(struct seq_file *s, void *v) +{ + struct pstore_private *ps = s->private; + struct pstore_ftrace_seq_data *data = v; + struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); + + seq_printf(s, "%d %08lx %08lx %pf <- %pF\n", + pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip, + (void *)rec->ip, (void *)rec->parent_ip); + + return 0; +} + +static const struct seq_operations pstore_ftrace_seq_ops = { + .start = pstore_ftrace_seq_start, + .next = pstore_ftrace_seq_next, + .stop = pstore_ftrace_seq_stop, + .show = pstore_ftrace_seq_show, +}; + static ssize_t pstore_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - struct pstore_private *ps = file->private_data; + struct seq_file *sf = file->private_data; + struct pstore_private *ps = sf->private; + if (ps->type == PSTORE_TYPE_FTRACE) + return seq_read(file, userbuf, count, ppos); return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); } +static int pstore_file_open(struct inode *inode, struct file *file) +{ + struct pstore_private *ps = inode->i_private; + struct seq_file *sf; + int err; + const struct seq_operations *sops = NULL; + + if (ps->type == PSTORE_TYPE_FTRACE) + sops = &pstore_ftrace_seq_ops; + + err = seq_open(file, sops); + if (err < 0) + return err; + + sf = file->private_data; + sf->private = ps; + + return 0; +} + +static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin) +{ + struct seq_file *sf = file->private_data; + + if (sf->op) + return seq_lseek(file, off, origin); + return default_llseek(file, off, origin); +} + static const struct file_operations pstore_file_operations = { - .open = simple_open, - .read = pstore_file_read, - .llseek = default_llseek, + .open = pstore_file_open, + .read = pstore_file_read, + .llseek = pstore_file_llseek, + .release = seq_release, }; /* @@ -212,6 +312,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, case PSTORE_TYPE_DMESG: sprintf(name, "dmesg-%s-%lld", psname, id); break; + case PSTORE_TYPE_CONSOLE: + sprintf(name, "console-%s", psname); + break; + case PSTORE_TYPE_FTRACE: + sprintf(name, "ftrace-%s", psname); + break; case PSTORE_TYPE_MCE: sprintf(name, "mce-%s-%lld", psname, id); break; diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 3bde461c3f3..0d0d3b7d5f1 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -1,6 +1,51 @@ +#ifndef __PSTORE_INTERNAL_H__ +#define __PSTORE_INTERNAL_H__ + +#include <linux/types.h> +#include <linux/time.h> +#include <linux/pstore.h> + +#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB) +#define PSTORE_CPU_IN_IP 0x1 +#elif NR_CPUS <= 4 && defined(CONFIG_ARM) +#define PSTORE_CPU_IN_IP 0x3 +#endif + +struct pstore_ftrace_record { + unsigned long ip; + unsigned long parent_ip; +#ifndef PSTORE_CPU_IN_IP + unsigned int cpu; +#endif +}; + +static inline void +pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu) +{ +#ifndef PSTORE_CPU_IN_IP + rec->cpu = cpu; +#else + rec->ip |= cpu; +#endif +} + +static inline unsigned int +pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) +{ +#ifndef PSTORE_CPU_IN_IP + return rec->cpu; +#else + return rec->ip & PSTORE_CPU_IN_IP; +#endif +} + +extern struct pstore_info *psinfo; + extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, struct timespec time, struct pstore_info *psi); extern int pstore_is_mounted(void); + +#endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 03ce7a9b81c..29996e8793a 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -1,6 +1,7 @@ /* * Persistent Storage - platform driver interface parts. * + * Copyright (C) 2007-2008 Google, Inc. * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com> * * This program is free software; you can redistribute it and/or modify @@ -22,6 +23,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/kmsg_dump.h> +#include <linux/console.h> #include <linux/module.h> #include <linux/pstore.h> #include <linux/string.h> @@ -29,6 +31,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/hardirq.h> +#include <linux/jiffies.h> #include <linux/workqueue.h> #include "internal.h" @@ -38,7 +41,12 @@ * whether the system is actually still running well enough * to let someone see the entry */ -#define PSTORE_INTERVAL (60 * HZ) +static int pstore_update_ms = -1; +module_param_named(update_ms, pstore_update_ms, int, 0600); +MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " + "(default is -1, which means runtime updates are disabled; " + "enabling this option is not safe, it may lead to further " + "corruption on Oopses)"); static int pstore_new_entry; @@ -53,7 +61,7 @@ static DECLARE_WORK(pstore_work, pstore_dowork); * calls to pstore_register() */ static DEFINE_SPINLOCK(pstore_lock); -static struct pstore_info *psinfo; +struct pstore_info *psinfo; static char *backend; @@ -146,6 +154,48 @@ static struct kmsg_dumper pstore_dumper = { .dump = pstore_dump, }; +#ifdef CONFIG_PSTORE_CONSOLE +static void pstore_console_write(struct console *con, const char *s, unsigned c) +{ + const char *e = s + c; + + while (s < e) { + unsigned long flags; + + if (c > psinfo->bufsize) + c = psinfo->bufsize; + spin_lock_irqsave(&psinfo->buf_lock, flags); + memcpy(psinfo->buf, s, c); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + s += c; + c = e - s; + } +} + +static struct console pstore_console = { + .name = "pstore", + .write = pstore_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME, + .index = -1, +}; + +static void pstore_register_console(void) +{ + register_console(&pstore_console); +} +#else +static void pstore_register_console(void) {} +#endif + +static int pstore_write_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + size_t size, struct pstore_info *psi) +{ + return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); +} + /* * platform specific persistent storage driver registers with * us here. If pstore is already mounted, call the platform @@ -170,6 +220,8 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } + if (!psi->write) + psi->write = pstore_write_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); @@ -183,9 +235,13 @@ int pstore_register(struct pstore_info *psi) pstore_get_records(0); kmsg_dump_register(&pstore_dumper); + pstore_register_console(); - pstore_timer.expires = jiffies + PSTORE_INTERVAL; - add_timer(&pstore_timer); + if (pstore_update_ms >= 0) { + pstore_timer.expires = jiffies + + msecs_to_jiffies(pstore_update_ms); + add_timer(&pstore_timer); + } return 0; } @@ -244,7 +300,7 @@ static void pstore_timefunc(unsigned long dummy) schedule_work(&pstore_work); } - mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL); + mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } module_param(backend, charp, 0444); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 453030f9c5b..0b311bc1891 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -25,6 +25,7 @@ #include <linux/kernel.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/version.h> #include <linux/pstore.h> #include <linux/time.h> #include <linux/io.h> @@ -41,6 +42,14 @@ module_param(record_size, ulong, 0400); MODULE_PARM_DESC(record_size, "size of each dump done on oops/panic"); +static ulong ramoops_console_size = MIN_MEM_SIZE; +module_param_named(console_size, ramoops_console_size, ulong, 0400); +MODULE_PARM_DESC(console_size, "size of kernel console log"); + +static ulong ramoops_ftrace_size = MIN_MEM_SIZE; +module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); +MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); + static ulong mem_address; module_param(mem_address, ulong, 0400); MODULE_PARM_DESC(mem_address, @@ -59,18 +68,26 @@ MODULE_PARM_DESC(dump_oops, static int ramoops_ecc; module_param_named(ecc, ramoops_ecc, int, 0600); MODULE_PARM_DESC(ramoops_ecc, - "set to 1 to enable ECC support"); + "if non-zero, the option enables ECC support and specifies " + "ECC buffer size in bytes (1 is a special value, means 16 " + "bytes ECC)"); struct ramoops_context { struct persistent_ram_zone **przs; + struct persistent_ram_zone *cprz; + struct persistent_ram_zone *fprz; phys_addr_t phys_addr; unsigned long size; size_t record_size; + size_t console_size; + size_t ftrace_size; int dump_oops; - bool ecc; - unsigned int count; - unsigned int max_count; - unsigned int read_count; + int ecc_size; + unsigned int max_dump_cnt; + unsigned int dump_write_cnt; + unsigned int dump_read_cnt; + unsigned int console_read_cnt; + unsigned int ftrace_read_cnt; struct pstore_info pstore; }; @@ -81,10 +98,38 @@ static int ramoops_pstore_open(struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; - cxt->read_count = 0; + cxt->dump_read_cnt = 0; + cxt->console_read_cnt = 0; return 0; } +static struct persistent_ram_zone * +ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, + u64 *id, + enum pstore_type_id *typep, enum pstore_type_id type, + bool update) +{ + struct persistent_ram_zone *prz; + int i = (*c)++; + + if (i >= max) + return NULL; + + prz = przs[i]; + + if (update) { + /* Update old/shadowed buffer. */ + persistent_ram_save_old(prz); + if (!persistent_ram_old_size(prz)) + return NULL; + } + + *typep = type; + *id = i; + + return prz; +} + static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, struct timespec *time, char **buf, @@ -94,20 +139,22 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz; - if (cxt->read_count >= cxt->max_count) - return -EINVAL; + prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, + cxt->max_dump_cnt, id, type, + PSTORE_TYPE_DMESG, 1); + if (!prz) + prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, + 1, id, type, PSTORE_TYPE_CONSOLE, 0); + if (!prz) + prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, + 1, id, type, PSTORE_TYPE_FTRACE, 0); + if (!prz) + return 0; - *id = cxt->read_count++; - prz = cxt->przs[*id]; - - /* Only supports dmesg output so far. */ - *type = PSTORE_TYPE_DMESG; /* TODO(kees): Bogus time for the moment. */ time->tv_sec = 0; time->tv_nsec = 0; - /* Update old/shadowed buffer. */ - persistent_ram_save_old(prz); size = persistent_ram_old_size(prz); *buf = kmalloc(size, GFP_KERNEL); if (*buf == NULL) @@ -134,17 +181,29 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) return len; } -static int ramoops_pstore_write(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, - unsigned int part, - size_t size, struct pstore_info *psi) + +static int ramoops_pstore_write_buf(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char *buf, size_t size, + struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; - struct persistent_ram_zone *prz = cxt->przs[cxt->count]; + struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt]; size_t hlen; - /* Currently ramoops is designed to only store dmesg dumps. */ + if (type == PSTORE_TYPE_CONSOLE) { + if (!cxt->cprz) + return -ENOMEM; + persistent_ram_write(cxt->cprz, buf, size); + return 0; + } else if (type == PSTORE_TYPE_FTRACE) { + if (!cxt->fprz) + return -ENOMEM; + persistent_ram_write(cxt->fprz, buf, size); + return 0; + } + if (type != PSTORE_TYPE_DMESG) return -EINVAL; @@ -170,9 +229,9 @@ static int ramoops_pstore_write(enum pstore_type_id type, hlen = ramoops_write_kmsg_hdr(prz); if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; - persistent_ram_write(prz, cxt->pstore.buf, size); + persistent_ram_write(prz, buf, size); - cxt->count = (cxt->count + 1) % cxt->max_count; + cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt; return 0; } @@ -181,12 +240,26 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; + struct persistent_ram_zone *prz; - if (id >= cxt->max_count) + switch (type) { + case PSTORE_TYPE_DMESG: + if (id >= cxt->max_dump_cnt) + return -EINVAL; + prz = cxt->przs[id]; + break; + case PSTORE_TYPE_CONSOLE: + prz = cxt->cprz; + break; + case PSTORE_TYPE_FTRACE: + prz = cxt->fprz; + break; + default: return -EINVAL; + } - persistent_ram_free_old(cxt->przs[id]); - persistent_ram_zap(cxt->przs[id]); + persistent_ram_free_old(prz); + persistent_ram_zap(prz); return 0; } @@ -197,78 +270,157 @@ static struct ramoops_context oops_cxt = { .name = "ramoops", .open = ramoops_pstore_open, .read = ramoops_pstore_read, - .write = ramoops_pstore_write, + .write_buf = ramoops_pstore_write_buf, .erase = ramoops_pstore_erase, }, }; -static int __init ramoops_probe(struct platform_device *pdev) +static void ramoops_free_przs(struct ramoops_context *cxt) +{ + int i; + + if (!cxt->przs) + return; + + for (i = 0; !IS_ERR_OR_NULL(cxt->przs[i]); i++) + persistent_ram_free(cxt->przs[i]); + kfree(cxt->przs); +} + +static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, + phys_addr_t *paddr, size_t dump_mem_sz) +{ + int err = -ENOMEM; + int i; + + if (!cxt->record_size) + return 0; + + cxt->max_dump_cnt = dump_mem_sz / cxt->record_size; + if (!cxt->max_dump_cnt) + return -ENOMEM; + + cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt, + GFP_KERNEL); + if (!cxt->przs) { + dev_err(dev, "failed to initialize a prz array for dumps\n"); + return -ENOMEM; + } + + for (i = 0; i < cxt->max_dump_cnt; i++) { + size_t sz = cxt->record_size; + + cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, cxt->ecc_size); + if (IS_ERR(cxt->przs[i])) { + err = PTR_ERR(cxt->przs[i]); + dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", + sz, (unsigned long long)*paddr, err); + goto fail_prz; + } + *paddr += sz; + } + + return 0; +fail_prz: + ramoops_free_przs(cxt); + return err; +} + +static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone **prz, + phys_addr_t *paddr, size_t sz, u32 sig) +{ + if (!sz) + return 0; + + if (*paddr + sz > *paddr + cxt->size) + return -ENOMEM; + + *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size); + if (IS_ERR(*prz)) { + int err = PTR_ERR(*prz); + + dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", + sz, (unsigned long long)*paddr, err); + return err; + } + + persistent_ram_zap(*prz); + + *paddr += sz; + + return 0; +} + +static int __devinit ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = pdev->dev.platform_data; struct ramoops_context *cxt = &oops_cxt; + size_t dump_mem_sz; + phys_addr_t paddr; int err = -EINVAL; - int i; /* Only a single ramoops area allowed at a time, so fail extra * probes. */ - if (cxt->max_count) + if (cxt->max_dump_cnt) goto fail_out; - if (!pdata->mem_size || !pdata->record_size) { - pr_err("The memory size and the record size must be " + if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && + !pdata->ftrace_size)) { + pr_err("The memory size and the record/console size must be " "non-zero\n"); goto fail_out; } pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); pdata->record_size = rounddown_pow_of_two(pdata->record_size); + pdata->console_size = rounddown_pow_of_two(pdata->console_size); + pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); - /* Check for the minimum memory size */ - if (pdata->mem_size < MIN_MEM_SIZE && - pdata->record_size < MIN_MEM_SIZE) { - pr_err("memory size too small, minimum is %lu\n", - MIN_MEM_SIZE); - goto fail_out; - } - - if (pdata->mem_size < pdata->record_size) { - pr_err("The memory size must be larger than the " - "records size\n"); - goto fail_out; - } - - cxt->max_count = pdata->mem_size / pdata->record_size; - cxt->count = 0; + cxt->dump_read_cnt = 0; cxt->size = pdata->mem_size; cxt->phys_addr = pdata->mem_address; cxt->record_size = pdata->record_size; + cxt->console_size = pdata->console_size; + cxt->ftrace_size = pdata->ftrace_size; cxt->dump_oops = pdata->dump_oops; - cxt->ecc = pdata->ecc; + cxt->ecc_size = pdata->ecc_size; - cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_count, GFP_KERNEL); - if (!cxt->przs) { - err = -ENOMEM; - dev_err(dev, "failed to initialize a prz array\n"); + paddr = cxt->phys_addr; + + dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; + err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); + if (err) goto fail_out; - } - for (i = 0; i < cxt->max_count; i++) { - size_t sz = cxt->record_size; - phys_addr_t start = cxt->phys_addr + sz * i; + err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr, + cxt->console_size, 0); + if (err) + goto fail_init_cprz; - cxt->przs[i] = persistent_ram_new(start, sz, cxt->ecc); - if (IS_ERR(cxt->przs[i])) { - err = PTR_ERR(cxt->przs[i]); - dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - sz, (unsigned long long)start, err); - goto fail_przs; - } + err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size, + LINUX_VERSION_CODE); + if (err) + goto fail_init_fprz; + + if (!cxt->przs && !cxt->cprz && !cxt->fprz) { + pr_err("memory size too small, minimum is %lu\n", + cxt->console_size + cxt->record_size + + cxt->ftrace_size); + goto fail_cnt; } cxt->pstore.data = cxt; - cxt->pstore.bufsize = cxt->przs[0]->buffer_size; + /* + * Console can handle any buffer size, so prefer dumps buffer + * size since usually it is smaller. + */ + if (cxt->przs) + cxt->pstore.bufsize = cxt->przs[0]->buffer_size; + else + cxt->pstore.bufsize = cxt->cprz->buffer_size; cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); spin_lock_init(&cxt->pstore.buf_lock); if (!cxt->pstore.buf) { @@ -291,10 +443,9 @@ static int __init ramoops_probe(struct platform_device *pdev) record_size = pdata->record_size; dump_oops = pdata->dump_oops; - pr_info("attached 0x%lx@0x%llx (%ux0x%zx), ecc: %s\n", + pr_info("attached 0x%lx@0x%llx, ecc: %d\n", cxt->size, (unsigned long long)cxt->phys_addr, - cxt->max_count, cxt->record_size, - ramoops_ecc ? "on" : "off"); + cxt->ecc_size); return 0; @@ -302,11 +453,13 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - cxt->max_count = 0; -fail_przs: - for (i = 0; cxt->przs[i]; i++) - persistent_ram_free(cxt->przs[i]); - kfree(cxt->przs); + cxt->max_dump_cnt = 0; +fail_cnt: + kfree(cxt->fprz); +fail_init_fprz: + kfree(cxt->cprz); +fail_init_cprz: + ramoops_free_przs(cxt); fail_out: return err; } @@ -321,7 +474,7 @@ static int __exit ramoops_remove(struct platform_device *pdev) iounmap(cxt->virt_addr); release_mem_region(cxt->phys_addr, cxt->size); - cxt->max_count = 0; + cxt->max_dump_cnt = 0; /* TODO(kees): When pstore supports unregistering, call it here. */ kfree(cxt->pstore.buf); @@ -333,6 +486,7 @@ static int __exit ramoops_remove(struct platform_device *pdev) } static struct platform_driver ramoops_driver = { + .probe = ramoops_probe, .remove = __exit_p(ramoops_remove), .driver = { .name = "ramoops", @@ -340,45 +494,51 @@ static struct platform_driver ramoops_driver = { }, }; -static int __init ramoops_init(void) +static void ramoops_register_dummy(void) { - int ret; - ret = platform_driver_probe(&ramoops_driver, ramoops_probe); - if (ret == -ENODEV) { - /* - * If we didn't find a platform device, we use module parameters - * building platform data on the fly. - */ - pr_info("platform device not found, using module parameters\n"); - dummy_data = kzalloc(sizeof(struct ramoops_platform_data), - GFP_KERNEL); - if (!dummy_data) - return -ENOMEM; - dummy_data->mem_size = mem_size; - dummy_data->mem_address = mem_address; - dummy_data->record_size = record_size; - dummy_data->dump_oops = dump_oops; - dummy_data->ecc = ramoops_ecc; - dummy = platform_create_bundle(&ramoops_driver, ramoops_probe, - NULL, 0, dummy_data, - sizeof(struct ramoops_platform_data)); - - if (IS_ERR(dummy)) - ret = PTR_ERR(dummy); - else - ret = 0; + if (!mem_size) + return; + + pr_info("using module parameters\n"); + + dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL); + if (!dummy_data) { + pr_info("could not allocate pdata\n"); + return; + } + + dummy_data->mem_size = mem_size; + dummy_data->mem_address = mem_address; + dummy_data->record_size = record_size; + dummy_data->console_size = ramoops_console_size; + dummy_data->ftrace_size = ramoops_ftrace_size; + dummy_data->dump_oops = dump_oops; + /* + * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC + * (using 1 byte for ECC isn't much of use anyway). + */ + dummy_data->ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; + + dummy = platform_device_register_data(NULL, "ramoops", -1, + dummy_data, sizeof(struct ramoops_platform_data)); + if (IS_ERR(dummy)) { + pr_info("could not create platform device: %ld\n", + PTR_ERR(dummy)); } +} - return ret; +static int __init ramoops_init(void) +{ + ramoops_register_dummy(); + return platform_driver_register(&ramoops_driver); } +postcore_initcall(ramoops_init); static void __exit ramoops_exit(void) { platform_driver_unregister(&ramoops_driver); kfree(dummy_data); } - -module_init(ramoops_init); module_exit(ramoops_exit); MODULE_LICENSE("GPL"); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index c5fbdbbf81a..eecd2a8a84d 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -35,8 +35,6 @@ struct persistent_ram_buffer { #define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */ -static __initdata LIST_HEAD(persistent_ram_list); - static inline size_t buffer_size(struct persistent_ram_zone *prz) { return atomic_read(&prz->buffer->size); @@ -116,7 +114,7 @@ static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz, int ecc_size = prz->ecc_size; int size = prz->ecc_block_size; - if (!prz->ecc) + if (!prz->ecc_size) return; block = buffer->data + (start & ~(ecc_block_size - 1)); @@ -135,7 +133,7 @@ static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; - if (!prz->ecc) + if (!prz->ecc_size) return; persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer), @@ -148,7 +146,7 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) uint8_t *block; uint8_t *par; - if (!prz->ecc) + if (!prz->ecc_size) return; block = buffer->data; @@ -174,29 +172,30 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) } static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, - size_t buffer_size) + int ecc_size) { int numerr; struct persistent_ram_buffer *buffer = prz->buffer; int ecc_blocks; + size_t ecc_total; + int ecc_symsize = 8; + int ecc_poly = 0x11d; - if (!prz->ecc) + if (!ecc_size) return 0; prz->ecc_block_size = 128; - prz->ecc_size = 16; - prz->ecc_symsize = 8; - prz->ecc_poly = 0x11d; + prz->ecc_size = ecc_size; ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size); - prz->buffer_size -= (ecc_blocks + 1) * prz->ecc_size; - - if (prz->buffer_size > buffer_size) { - pr_err("persistent_ram: invalid size %zu, non-ecc datasize %zu\n", - buffer_size, prz->buffer_size); + ecc_total = (ecc_blocks + 1) * prz->ecc_size; + if (ecc_total >= prz->buffer_size) { + pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n", + __func__, prz->ecc_size, ecc_total, prz->buffer_size); return -EINVAL; } + prz->buffer_size -= ecc_total; prz->par_buffer = buffer->data + prz->buffer_size; prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size; @@ -204,8 +203,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, * first consecutive root is 0 * primitive element to generate roots = 1 */ - prz->rs_decoder = init_rs(prz->ecc_symsize, prz->ecc_poly, 0, 1, - prz->ecc_size); + prz->rs_decoder = init_rs(ecc_symsize, ecc_poly, 0, 1, prz->ecc_size); if (prz->rs_decoder == NULL) { pr_info("persistent_ram: init_rs failed\n"); return -EINVAL; @@ -392,35 +390,36 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, return 0; } -static int __init persistent_ram_post_init(struct persistent_ram_zone *prz, bool ecc) +static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, + u32 sig, int ecc_size) { int ret; - prz->ecc = ecc; - - ret = persistent_ram_init_ecc(prz, prz->buffer_size); + ret = persistent_ram_init_ecc(prz, ecc_size); if (ret) return ret; - if (prz->buffer->sig == PERSISTENT_RAM_SIG) { + sig ^= PERSISTENT_RAM_SIG; + + if (prz->buffer->sig == sig) { if (buffer_size(prz) > prz->buffer_size || buffer_start(prz) > buffer_size(prz)) pr_info("persistent_ram: found existing invalid buffer," " size %zu, start %zu\n", buffer_size(prz), buffer_start(prz)); else { - pr_info("persistent_ram: found existing buffer," + pr_debug("persistent_ram: found existing buffer," " size %zu, start %zu\n", buffer_size(prz), buffer_start(prz)); persistent_ram_save_old(prz); return 0; } } else { - pr_info("persistent_ram: no valid data in buffer" + pr_debug("persistent_ram: no valid data in buffer" " (sig = 0x%08x)\n", prz->buffer->sig); } - prz->buffer->sig = PERSISTENT_RAM_SIG; + prz->buffer->sig = sig; persistent_ram_zap(prz); return 0; @@ -428,19 +427,25 @@ static int __init persistent_ram_post_init(struct persistent_ram_zone *prz, bool void persistent_ram_free(struct persistent_ram_zone *prz) { - if (pfn_valid(prz->paddr >> PAGE_SHIFT)) { - vunmap(prz->vaddr); - } else { - iounmap(prz->vaddr); - release_mem_region(prz->paddr, prz->size); + if (!prz) + return; + + if (prz->vaddr) { + if (pfn_valid(prz->paddr >> PAGE_SHIFT)) { + vunmap(prz->vaddr); + } else { + iounmap(prz->vaddr); + release_mem_region(prz->paddr, prz->size); + } + prz->vaddr = NULL; } persistent_ram_free_old(prz); kfree(prz); } -struct persistent_ram_zone * __init persistent_ram_new(phys_addr_t start, - size_t size, - bool ecc) +struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, + size_t size, u32 sig, + int ecc_size) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -455,85 +460,12 @@ struct persistent_ram_zone * __init persistent_ram_new(phys_addr_t start, if (ret) goto err; - persistent_ram_post_init(prz, ecc); - - return prz; -err: - kfree(prz); - return ERR_PTR(ret); -} - -#ifndef MODULE -static int __init persistent_ram_buffer_init(const char *name, - struct persistent_ram_zone *prz) -{ - int i; - struct persistent_ram *ram; - struct persistent_ram_descriptor *desc; - phys_addr_t start; - - list_for_each_entry(ram, &persistent_ram_list, node) { - start = ram->start; - for (i = 0; i < ram->num_descs; i++) { - desc = &ram->descs[i]; - if (!strcmp(desc->name, name)) - return persistent_ram_buffer_map(start, - desc->size, prz); - start += desc->size; - } - } - - return -EINVAL; -} - -static __init -struct persistent_ram_zone *__persistent_ram_init(struct device *dev, bool ecc) -{ - struct persistent_ram_zone *prz; - int ret = -ENOMEM; - - prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL); - if (!prz) { - pr_err("persistent_ram: failed to allocate persistent ram zone\n"); - goto err; - } - - ret = persistent_ram_buffer_init(dev_name(dev), prz); - if (ret) { - pr_err("persistent_ram: failed to initialize buffer\n"); + ret = persistent_ram_post_init(prz, sig, ecc_size); + if (ret) goto err; - } - - persistent_ram_post_init(prz, ecc); return prz; err: - kfree(prz); + persistent_ram_free(prz); return ERR_PTR(ret); } - -struct persistent_ram_zone * __init -persistent_ram_init_ringbuffer(struct device *dev, bool ecc) -{ - return __persistent_ram_init(dev, ecc); -} - -int __init persistent_ram_early_init(struct persistent_ram *ram) -{ - int ret; - - ret = memblock_reserve(ram->start, ram->size); - if (ret) { - pr_err("Failed to reserve persistent memory from %08lx-%08lx\n", - (long)ram->start, (long)(ram->start + ram->size - 1)); - return ret; - } - - list_add_tail(&ram->node, &persistent_ram_list); - - pr_info("Initialized persistent memory from %08lx-%08lx\n", - (long)ram->start, (long)(ram->start + ram->size - 1)); - - return 0; -} -#endif diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index a512c0b30e8..d024505ba00 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -95,7 +95,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, return NULL; } -struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int ino; struct qnx4_inode_entry *de; diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 244d4620189..34e2d329c97 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -23,7 +23,7 @@ struct qnx4_inode_info { }; extern struct inode *qnx4_iget(struct super_block *, unsigned long); -extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern unsigned long qnx4_count_free_blocks(struct super_block *sb); extern unsigned long qnx4_block_map(struct inode *inode, long iblock); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index e44012dc564..2049c814bda 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -622,7 +622,6 @@ static struct inode *qnx6_alloc_inode(struct super_block *sb) static void qnx6_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode)); } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 8a97289e04a..0561326a94f 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -13,7 +13,7 @@ #include "qnx6.h" struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { unsigned ino; struct page *page; diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index 6c5e02a0b6a..b00fcc960d3 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -45,7 +45,7 @@ struct qnx6_inode_info { extern struct inode *qnx6_iget(struct super_block *sb, unsigned ino); extern struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); #ifdef CONFIG_QNX6FS_DEBUG extern void qnx6_superblock_debug(struct qnx6_super_block *, diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 10cbe841cb7..36a29b753c7 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -78,7 +78,7 @@ #include <linux/quotaops.h> #include "../internal.h" /* ugh */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * There are three quota SMP locks. dq_list_lock protects all lists with quotas @@ -595,12 +595,14 @@ out: } EXPORT_SYMBOL(dquot_scan_active); -int dquot_quota_sync(struct super_block *sb, int type, int wait) +/* Write all dquot structures to quota files */ +int dquot_writeback_dquots(struct super_block *sb, int type) { struct list_head *dirty; struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); int cnt; + int err, ret = 0; mutex_lock(&dqopt->dqonoff_mutex); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -624,7 +626,9 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); - sb->dq_op->write_dquot(dquot); + err = sb->dq_op->write_dquot(dquot); + if (!ret && err) + err = ret; dqput(dquot); spin_lock(&dq_list_lock); } @@ -638,7 +642,21 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) dqstats_inc(DQST_SYNCS); mutex_unlock(&dqopt->dqonoff_mutex); - if (!wait || (dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return ret; +} +EXPORT_SYMBOL(dquot_writeback_dquots); + +/* Write all dquot structures to disk and make them visible from userspace */ +int dquot_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) return 0; /* This is not very clever (and fast) but currently I don't know about diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 9a391204ca2..6f155788cbc 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -9,7 +9,7 @@ #include <linux/namei.h> #include <linux/slab.h> #include <asm/current.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -47,7 +47,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, static void quota_sync_one(struct super_block *sb, void *arg) { if (sb->s_qcop && sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, *(int *)arg, 1); + sb->s_qcop->quota_sync(sb, *(int *)arg); } static int quota_sync_all(int type) @@ -270,7 +270,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_SYNC: if (!sb->s_qcop->quota_sync) return -ENOSYS; - return sb->s_qcop->quota_sync(sb, type, 1); + return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: case Q_XQUOTAOFF: case Q_XQUOTARM: diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a1fdabe21de..eab8c09d380 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -114,7 +114,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) return retval; } -static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) +static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); } diff --git a/fs/read_write.c b/fs/read_write.c index c20614f86c0..1adfb691e4f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -55,10 +55,11 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, * @file: file structure to seek on * @offset: file offset to seek to * @origin: type of seek - * @size: max size of file system + * @size: max size of this file in file system + * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom - * file size. + * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) @@ -67,13 +68,13 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int origin, - loff_t maxsize) + loff_t maxsize, loff_t eof) { struct inode *inode = file->f_mapping->host; switch (origin) { case SEEK_END: - offset += i_size_read(inode); + offset += eof; break; case SEEK_CUR: /* @@ -99,7 +100,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int origin, * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= i_size_read(inode)) + if (offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -107,9 +108,9 @@ generic_file_llseek_size(struct file *file, loff_t offset, int origin, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= i_size_read(inode)) + if (offset >= eof) return -ENXIO; - offset = i_size_read(inode); + offset = eof; break; } @@ -132,7 +133,8 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, origin, - inode->i_sb->s_maxbytes); + inode->i_sb->s_maxbytes, + i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 84e8a69cee9..8567fb84760 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -322,7 +322,7 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, } static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { int retval; int lock_depth; @@ -573,7 +573,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) } static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { int retval; struct inode *inode; @@ -634,8 +634,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: @@ -712,8 +712,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode goto out_failed; } - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: @@ -800,8 +800,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode // the above add_entry did not update dir's stat data reiserfs_update_sd(&th, dir); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: reiserfs_write_unlock_once(dir->i_sb, lock_depth); @@ -1096,8 +1096,8 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); retval = journal_end(&th, parent_dir->i_sb, jbegin_count); out_failed: reiserfs_write_unlock(parent_dir->i_sb); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 2c1ade692cc..e60e87035bb 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -403,7 +403,7 @@ static void *r_start(struct seq_file *m, loff_t * pos) if (l) return NULL; - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, s))) + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, s))) return NULL; up_write(&s->s_umount); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 651ce767b55..7a37dabf5a9 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -68,6 +68,11 @@ static int reiserfs_sync_fs(struct super_block *s, int wait) { struct reiserfs_transaction_handle th; + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(s, -1); reiserfs_write_lock(s); if (!journal_begin(&th, s, 1)) if (!journal_end_sync(&th, s, 1)) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 46fc1c20a6b..d319963aeb1 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -62,7 +62,7 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - return dir->i_op->create(dir, dentry, mode, NULL); + return dir->i_op->create(dir, dentry, mode, true); } #endif @@ -942,7 +942,7 @@ int reiserfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } -static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) +static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) { return -EPERM; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e64f6b5f7ae..77c5f217398 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -210,7 +210,7 @@ out: * look up an entry in a directory */ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { unsigned long offset, maxoff; struct inode *inode; diff --git a/fs/seq_file.c b/fs/seq_file.c index 0cbd0494b79..14cf9de1dbe 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -385,15 +385,12 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc) } EXPORT_SYMBOL(seq_escape); -int seq_printf(struct seq_file *m, const char *f, ...) +int seq_vprintf(struct seq_file *m, const char *f, va_list args) { - va_list args; int len; if (m->count < m->size) { - va_start(args, f); len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); - va_end(args); if (m->count + len < m->size) { m->count += len; return 0; @@ -402,6 +399,19 @@ int seq_printf(struct seq_file *m, const char *f, ...) seq_set_overflow(m); return -1; } +EXPORT_SYMBOL(seq_vprintf); + +int seq_printf(struct seq_file *m, const char *f, ...) +{ + int ret; + va_list args; + + va_start(args, f); + ret = seq_vprintf(m, f, args); + va_end(args); + + return ret; +} EXPORT_SYMBOL(seq_printf); /** diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index abcc58f3c15..7834a517f7f 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -134,7 +134,7 @@ out: static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { const unsigned char *name = dentry->d_name.name; int len = dentry->d_name.len; diff --git a/fs/super.c b/fs/super.c index cf001775617..c743fb3be4b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -105,11 +105,12 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to + * @flags: the mount flags * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ -static struct super_block *alloc_super(struct file_system_type *type) +static struct super_block *alloc_super(struct file_system_type *type, int flags) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -136,6 +137,7 @@ static struct super_block *alloc_super(struct file_system_type *type) #else INIT_LIST_HEAD(&s->s_files); #endif + s->s_flags = flags; s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); @@ -415,11 +417,13 @@ EXPORT_SYMBOL(generic_shutdown_super); * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback + * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), + int flags, void *data) { struct super_block *s = NULL; @@ -450,7 +454,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type); + s = alloc_super(type, flags); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -925,13 +929,12 @@ struct dentry *mount_ns(struct file_system_type *fs_type, int flags, { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, data); + sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { int err; - sb->s_flags = flags; err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) { deactivate_locked_super(sb); @@ -992,7 +995,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error = -EBUSY; goto error_bdev; } - s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, + bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; @@ -1017,7 +1021,6 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags | MS_NOSEC; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); @@ -1062,13 +1065,11 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, int (*fill_super)(struct super_block *, void *, int)) { int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); - s->s_flags = flags; - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); @@ -1091,11 +1092,10 @@ struct dentry *mount_single(struct file_system_type *fs_type, struct super_block *s; int error; - s = sget(fs_type, compare_single, set_anon_super, NULL); + s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { - s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); diff --git a/fs/sync.c b/fs/sync.c index 11e3d1c4490..eb8722dc556 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -29,16 +29,6 @@ */ static int __sync_filesystem(struct super_block *sb, int wait) { - /* - * This should be safe, as we require bdi backing to actually - * write out data in the first place - */ - if (sb->s_bdi == &noop_backing_dev_info) - return 0; - - if (sb->s_qcop && sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, -1, wait); - if (wait) sync_inodes_sb(sb); else @@ -77,29 +67,48 @@ int sync_filesystem(struct super_block *sb) } EXPORT_SYMBOL_GPL(sync_filesystem); -static void sync_one_sb(struct super_block *sb, void *arg) +static void sync_inodes_one_sb(struct super_block *sb, void *arg) { if (!(sb->s_flags & MS_RDONLY)) - __sync_filesystem(sb, *(int *)arg); + sync_inodes_sb(sb); } -/* - * Sync all the data for all the filesystems (called by sys_sync() and - * emergency sync) - */ -static void sync_filesystems(int wait) + +static void sync_fs_one_sb(struct super_block *sb, void *arg) { - iterate_supers(sync_one_sb, &wait); + if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, *(int *)arg); +} + +static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) +{ + filemap_fdatawrite(bdev->bd_inode->i_mapping); +} + +static void fdatawait_one_bdev(struct block_device *bdev, void *arg) +{ + filemap_fdatawait(bdev->bd_inode->i_mapping); } /* - * sync everything. Start out by waking pdflush, because that writes back - * all queues in parallel. + * Sync everything. We start by waking flusher threads so that most of + * writeback runs on all devices in parallel. Then we sync all inodes reliably + * which effectively also waits for all flusher threads to finish doing + * writeback. At this point all data is on disk so metadata should be stable + * and we tell filesystems to sync their metadata via ->sync_fs() calls. + * Finally, we writeout all block devices because some filesystems (e.g. ext2) + * just write metadata (such as inodes or bitmaps) to block device page cache + * and do not sync it on their own in ->sync_fs(). */ SYSCALL_DEFINE0(sync) { + int nowait = 0, wait = 1; + wakeup_flusher_threads(0, WB_REASON_SYNC); - sync_filesystems(0); - sync_filesystems(1); + iterate_supers(sync_inodes_one_sb, NULL); + iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_fs_one_sb, &wait); + iterate_bdevs(fdatawrite_one_bdev, NULL); + iterate_bdevs(fdatawait_one_bdev, NULL); if (unlikely(laptop_mode)) laptop_sync_completion(); return 0; @@ -107,12 +116,18 @@ SYSCALL_DEFINE0(sync) static void do_sync_work(struct work_struct *work) { + int nowait = 0; + /* * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ - sync_filesystems(0); - sync_filesystems(0); + iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_fs_one_sb, &nowait); + iterate_bdevs(fdatawrite_one_bdev, NULL); + iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_fs_one_sb, &nowait); + iterate_bdevs(fdatawrite_one_bdev, NULL); printk("Emergency Sync complete\n"); kfree(work); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 1cdfb53199a..6b0bb00d4d2 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -300,16 +300,16 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) static int sysfs_dentry_delete(const struct dentry *dentry) { struct sysfs_dirent *sd = dentry->d_fsdata; - return !!(sd->s_flags & SYSFS_FLAG_REMOVED); + return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); } -static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) +static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct sysfs_dirent *sd; int is_dir; int type; - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; sd = dentry->d_fsdata; @@ -365,18 +365,15 @@ out_bad: return 0; } -static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode) +static void sysfs_dentry_release(struct dentry *dentry) { - struct sysfs_dirent * sd = dentry->d_fsdata; - - sysfs_put(sd); - iput(inode); + sysfs_put(dentry->d_fsdata); } -static const struct dentry_operations sysfs_dentry_ops = { +const struct dentry_operations sysfs_dentry_ops = { .d_revalidate = sysfs_dentry_revalidate, .d_delete = sysfs_dentry_delete, - .d_iput = sysfs_dentry_iput, + .d_release = sysfs_dentry_release, }; struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) @@ -774,7 +771,7 @@ int sysfs_create_dir(struct kobject * kobj) } static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *ret = NULL; struct dentry *parent = dentry->d_parent; @@ -796,6 +793,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, ret = ERR_PTR(-ENOENT); goto out_unlock; } + dentry->d_fsdata = sysfs_get(sd); /* attach dentry and inode */ inode = sysfs_get_inode(dir->i_sb, sd); @@ -805,16 +803,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, } /* instantiate and hash dentry */ - ret = d_find_alias(inode); - if (!ret) { - d_set_d_op(dentry, &sysfs_dentry_ops); - dentry->d_fsdata = sysfs_get(sd); - d_add(dentry, inode); - } else { - d_move(ret, dentry); - iput(inode); - } - + ret = d_materialise_unique(dentry, inode); out_unlock: mutex_unlock(&sysfs_mutex); return ret; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 52c3bdb66a8..71eb7e25392 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -68,6 +68,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) } root->d_fsdata = &sysfs_root; sb->s_root = root; + sb->s_d_op = &sysfs_dentry_ops; return 0; } @@ -117,13 +118,12 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) info->ns[type] = kobj_ns_grab_current(type); - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); + sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); if (IS_ERR(sb) || sb->s_fs_info != info) free_sysfs_super_info(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { - sb->s_flags = flags; error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(sb); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 661a9639570..d73c0932bbd 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -157,6 +157,7 @@ extern struct kmem_cache *sysfs_dir_cachep; */ extern struct mutex sysfs_mutex; extern spinlock_t sysfs_assoc_lock; +extern const struct dentry_operations sysfs_dentry_ops; extern const struct file_operations sysfs_dir_operations; extern const struct inode_operations sysfs_dir_inode_operations; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 08d0b2568cd..80e1e2b18df 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -43,7 +43,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait) * then attach current time stamp. * But if the filesystem was marked clean, keep it clean. */ - sb->s_dirt = 0; old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); if (sbi->s_type == FSTYPE_SYSV4) { if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) @@ -57,23 +56,12 @@ static int sysv_sync_fs(struct super_block *sb, int wait) return 0; } -static void sysv_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - sysv_sync_fs(sb, 1); - else - sb->s_dirt = 0; -} - static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); - lock_super(sb); + if (sbi->s_forced_ro) *flags |= MS_RDONLY; - if (*flags & MS_RDONLY) - sysv_write_super(sb); - unlock_super(sb); return 0; } @@ -81,9 +69,6 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); - if (sb->s_dirt) - sysv_write_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { /* XXX ext2 also updates the state here */ mark_buffer_dirty(sbi->s_bh1); @@ -357,7 +342,6 @@ const struct super_operations sysv_sops = { .write_inode = sysv_write_inode, .evict_inode = sysv_evict_inode, .put_super = sysv_put_super, - .write_super = sysv_write_super, .sync_fs = sysv_sync_fs, .remount_fs = sysv_remount, .statfs = sysv_statfs, diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index d7466e29361..1c0d5f26476 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -43,7 +43,7 @@ const struct dentry_operations sysv_dentry_operations = { .d_hash = sysv_hash, }; -static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; @@ -80,7 +80,7 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, return err; } -static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) +static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { return sysv_mknod(dir, dentry, mode, 0); } diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 11b07672f6c..0bc35fdc58e 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -117,7 +117,6 @@ static inline void dirty_sb(struct super_block *sb) mark_buffer_dirty(sbi->s_bh1); if (sbi->s_bh1 != sbi->s_bh2) mark_buffer_dirty(sbi->s_bh2); - sb->s_dirt = 1; } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 92df3b08153..bb3167257aa 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2802,6 +2802,8 @@ static ssize_t dfs_file_read(struct file *file, char __user *u, size_t count, val = d->chk_fs; else if (dent == d->dfs_tst_rcvry) val = d->tst_rcvry; + else if (dent == d->dfs_ro_error) + val = c->ro_error; else return -EINVAL; @@ -2885,6 +2887,8 @@ static ssize_t dfs_file_write(struct file *file, const char __user *u, d->chk_fs = val; else if (dent == d->dfs_tst_rcvry) d->tst_rcvry = val; + else if (dent == d->dfs_ro_error) + c->ro_error = !!val; else return -EINVAL; @@ -2996,6 +3000,13 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) goto out_remove; d->dfs_tst_rcvry = dent; + fname = "ro_error"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_ro_error = dent; + return 0; out_remove: diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 486a8e024fb..8b8cc4e945f 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -79,6 +79,10 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, * @dfs_chk_lprops: debugfs knob to enable UBIFS LEP properties extra checks * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing + * @dfs_ro_error: debugfs knob to switch UBIFS to R/O mode (different to + * re-mounting to R/O mode because it does not flush any buffers + * and UBIFS just starts returning -EROFS on all write + * operations) */ struct ubifs_debug_info { struct ubifs_zbranch old_zroot; @@ -122,6 +126,7 @@ struct ubifs_debug_info { struct dentry *dfs_chk_lprops; struct dentry *dfs_chk_fs; struct dentry *dfs_tst_rcvry; + struct dentry *dfs_ro_error; }; /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index a6d42efc76d..c95681cf1b7 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -184,7 +184,7 @@ static int dbg_check_name(const struct ubifs_info *c, } static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { int err; union ubifs_key key; @@ -246,7 +246,7 @@ out: } static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -969,7 +969,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; - unsigned int saved_nlink; + unsigned int uninitialized_var(saved_nlink); /* * Budget request settings: deletion direntry, new direntry, removing diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index b02734db187..cebf17ea045 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -176,7 +176,7 @@ int ubifs_orphan_start_commit(struct ubifs_info *c) *last = orphan; last = &orphan->cnext; } - *last = orphan->cnext; + *last = NULL; c->cmt_orphans = c->new_orphans; c->new_orphans = 0; dbg_cmt("%d orphans to commit", c->cmt_orphans); @@ -382,7 +382,7 @@ static int consolidate(struct ubifs_info *c) last = &orphan->cnext; cnt += 1; } - *last = orphan->cnext; + *last = NULL; ubifs_assert(cnt == c->tot_orphans - c->new_orphans); c->cmt_orphans = cnt; c->ohead_lnum = c->orph_first; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 3a2da7e476e..eba46d4a761 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -1007,7 +1007,7 @@ out: */ int ubifs_replay_journal(struct ubifs_info *c) { - int err, i, lnum, offs, free; + int err, lnum, free; BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); @@ -1025,25 +1025,17 @@ int ubifs_replay_journal(struct ubifs_info *c) dbg_mnt("start replaying the journal"); c->replaying = 1; lnum = c->ltail_lnum = c->lhead_lnum; - offs = c->lhead_offs; - for (i = 0; i < c->log_lebs; i++, lnum++) { - if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) { - /* - * The log is logically circular, we reached the last - * LEB, switch to the first one. - */ - lnum = UBIFS_LOG_LNUM; - offs = 0; - } - err = replay_log_leb(c, lnum, offs, c->sbuf); + lnum = UBIFS_LOG_LNUM; + do { + err = replay_log_leb(c, lnum, 0, c->sbuf); if (err == 1) /* We hit the end of the log */ break; if (err) goto out; - offs = 0; - } + lnum = ubifs_next_log_lnum(c, lnum); + } while (lnum != UBIFS_LOG_LNUM); err = replay_buds(c); if (err) diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index ef3d1ba6d99..15e2fc5aa60 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -718,8 +718,12 @@ static int fixup_free_space(struct ubifs_info *c) lnum = ubifs_next_log_lnum(c, lnum); } - /* Fixup the current log head */ - err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); + /* + * Fixup the log head which contains the only a CS node at the + * beginning. + */ + err = fixup_leb(c, c->lhead_lnum, + ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size)); if (err) goto out; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5862dd9d278..1c766c39c03 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2136,7 +2136,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, sb_test, sb_set, c); + sb = sget(fs_type, sb_test, sb_set, flags, c); if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); @@ -2153,7 +2153,6 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, goto out_deact; } } else { - sb->s_flags = flags; err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) goto out_deact; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 873e1bab9c4..fafaad795cd 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1247,7 +1247,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) { struct fileEntry *fe; struct extendedFileEntry *efe; - int offset; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct udf_inode_info *iinfo = UDF_I(inode); unsigned int link_count; @@ -1359,7 +1358,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); - offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr; } else { inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); @@ -1381,8 +1379,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); - offset = sizeof(struct extendedFileEntry) + - iinfo->i_lenEAttr; } switch (fe->icbTag.fileType) { diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 18024178ac4..95fee278ab9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -251,7 +251,7 @@ out_ok: } static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; struct fileIdentDesc cfi; @@ -551,7 +551,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, } static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct udf_fileident_bh fibh; struct inode *inode; @@ -1279,6 +1279,7 @@ static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp, *lenp = 3; fid->udf.block = location.logicalBlockNum; fid->udf.partref = location.partitionReferenceNum; + fid->udf.parent_partref = 0; fid->udf.generation = inode->i_generation; if (parent) { diff --git a/fs/udf/super.c b/fs/udf/super.c index 8d86a8706c0..dcbf98722af 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -252,6 +252,63 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) return 0; } +static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) +{ + int i; + int nr_groups = bitmap->s_nr_groups; + int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * + nr_groups); + + for (i = 0; i < nr_groups; i++) + if (bitmap->s_block_bitmap[i]) + brelse(bitmap->s_block_bitmap[i]); + + if (size <= PAGE_SIZE) + kfree(bitmap); + else + vfree(bitmap); +} + +static void udf_free_partition(struct udf_part_map *map) +{ + int i; + struct udf_meta_data *mdata; + + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) + iput(map->s_uspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) + iput(map->s_fspace.s_table); + if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) + udf_sb_free_bitmap(map->s_uspace.s_bitmap); + if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) + udf_sb_free_bitmap(map->s_fspace.s_bitmap); + if (map->s_partition_type == UDF_SPARABLE_MAP15) + for (i = 0; i < 4; i++) + brelse(map->s_type_specific.s_sparing.s_spar_map[i]); + else if (map->s_partition_type == UDF_METADATA_MAP25) { + mdata = &map->s_type_specific.s_metadata; + iput(mdata->s_metadata_fe); + mdata->s_metadata_fe = NULL; + + iput(mdata->s_mirror_fe); + mdata->s_mirror_fe = NULL; + + iput(mdata->s_bitmap_fe); + mdata->s_bitmap_fe = NULL; + } +} + +static void udf_sb_free_partitions(struct super_block *sb) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + int i; + + for (i = 0; i < sbi->s_partitions; i++) + udf_free_partition(&sbi->s_partmaps[i]); + kfree(sbi->s_partmaps); + sbi->s_partmaps = NULL; +} + static int udf_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; @@ -1283,7 +1340,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, BUG_ON(ident != TAG_IDENT_LVD); lvd = (struct logicalVolDesc *)bh->b_data; table_len = le32_to_cpu(lvd->mapTableLength); - if (sizeof(*lvd) + table_len > sb->s_blocksize) { + if (table_len > sb->s_blocksize - sizeof(*lvd)) { udf_err(sb, "error loading logical volume descriptor: " "Partition table too long (%u > %lu)\n", table_len, sb->s_blocksize - sizeof(*lvd)); @@ -1596,7 +1653,11 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, /* responsible for finding the PartitionDesc(s) */ if (!udf_process_sequence(sb, main_s, main_e, fileset)) return 1; - return !udf_process_sequence(sb, reserve_s, reserve_e, fileset); + udf_sb_free_partitions(sb); + if (!udf_process_sequence(sb, reserve_s, reserve_e, fileset)) + return 1; + udf_sb_free_partitions(sb); + return 0; } /* @@ -1861,55 +1922,8 @@ u64 lvid_get_unique_id(struct super_block *sb) return ret; } -static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) -{ - int i; - int nr_groups = bitmap->s_nr_groups; - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * - nr_groups); - - for (i = 0; i < nr_groups; i++) - if (bitmap->s_block_bitmap[i]) - brelse(bitmap->s_block_bitmap[i]); - - if (size <= PAGE_SIZE) - kfree(bitmap); - else - vfree(bitmap); -} - -static void udf_free_partition(struct udf_part_map *map) -{ - int i; - struct udf_meta_data *mdata; - - if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) - iput(map->s_uspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - iput(map->s_fspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) - udf_sb_free_bitmap(map->s_uspace.s_bitmap); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - udf_sb_free_bitmap(map->s_fspace.s_bitmap); - if (map->s_partition_type == UDF_SPARABLE_MAP15) - for (i = 0; i < 4; i++) - brelse(map->s_type_specific.s_sparing.s_spar_map[i]); - else if (map->s_partition_type == UDF_METADATA_MAP25) { - mdata = &map->s_type_specific.s_metadata; - iput(mdata->s_metadata_fe); - mdata->s_metadata_fe = NULL; - - iput(mdata->s_mirror_fe); - mdata->s_mirror_fe = NULL; - - iput(mdata->s_bitmap_fe); - mdata->s_bitmap_fe = NULL; - } -} - static int udf_fill_super(struct super_block *sb, void *options, int silent) { - int i; int ret; struct inode *inode = NULL; struct udf_options uopt; @@ -1974,7 +1988,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sb->s_op = &udf_sb_ops; sb->s_export_op = &udf_export_ops; - sb->s_dirt = 0; sb->s_magic = UDF_SUPER_MAGIC; sb->s_time_gran = 1000; @@ -2072,9 +2085,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: if (sbi->s_vat_inode) iput(sbi->s_vat_inode); - if (sbi->s_partitions) - for (i = 0; i < sbi->s_partitions; i++) - udf_free_partition(&sbi->s_partmaps[i]); #ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); @@ -2082,8 +2092,7 @@ error_out: if (!(sb->s_flags & MS_RDONLY)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); - - kfree(sbi->s_partmaps); + udf_sb_free_partitions(sb); kfree(sbi); sb->s_fs_info = NULL; @@ -2096,10 +2105,6 @@ void _udf_err(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - /* mark sb error */ - if (!(sb->s_flags & MS_RDONLY)) - sb->s_dirt = 1; - va_start(args, fmt); vaf.fmt = fmt; @@ -2128,16 +2133,12 @@ void _udf_warn(struct super_block *sb, const char *function, static void udf_put_super(struct super_block *sb) { - int i; struct udf_sb_info *sbi; sbi = UDF_SB(sb); if (sbi->s_vat_inode) iput(sbi->s_vat_inode); - if (sbi->s_partitions) - for (i = 0; i < sbi->s_partitions; i++) - udf_free_partition(&sbi->s_partmaps[i]); #ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); @@ -2145,7 +2146,7 @@ static void udf_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); - kfree(sbi->s_partmaps); + udf_sb_free_partitions(sb); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } @@ -2161,7 +2162,6 @@ static int udf_sync_fs(struct super_block *sb, int wait) * the buffer for IO */ mark_buffer_dirty(sbi->s_lvid_bh); - sb->s_dirt = 0; sbi->s_lvid_dirty = 0; } mutex_unlock(&sbi->s_alloc_mutex); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 4b98fee8e16..8a9657d7f7c 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -248,7 +248,7 @@ void udf_truncate_extents(struct inode *inode) /* We managed to free all extents in the * indirect extent - free it too */ BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, &epos.block, + udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; @@ -275,7 +275,7 @@ void udf_truncate_extents(struct inode *inode) if (indirect_ext_len) { BUG_ON(!epos.bh); - udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len); + udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ebe10314e51..de038da6f6b 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -129,7 +129,6 @@ static inline void udf_updated_lvid(struct super_block *sb) WARN_ON_ONCE(((struct logicalVolIntegrityDesc *) bh->b_data)->integrityType != cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN)); - sb->s_dirt = 1; UDF_SB(sb)->s_lvid_dirty = 1; } extern u64 lvid_get_unique_id(struct super_block *sb); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 42694e11c23..1b3e410bf33 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -116,7 +116,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); unlock_super (sb); UFSD("EXIT\n"); @@ -214,7 +214,7 @@ do_more: goto do_more; } - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); unlock_super (sb); UFSD("EXIT\n"); return; @@ -557,7 +557,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); @@ -677,7 +677,7 @@ succed: ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); result += cgno * uspi->s_fpg; UFSD("EXIT3, result %llu\n", (unsigned long long)result); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 4ec5c1085a8..e84cbe21b98 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -116,7 +116,7 @@ void ufs_free_inode (struct inode * inode) if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); unlock_super (sb); UFSD("EXIT\n"); } @@ -288,7 +288,7 @@ cg_found: ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); inode->i_ino = cg * uspi->s_ipg + bit; inode_init_owner(inode, dir, mode); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index a2281cadefa..90d74b8f8eb 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -46,7 +46,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; @@ -71,7 +71,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru * with d_instantiate(). */ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode; int err; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 302f340d007..444927e5706 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -302,7 +302,7 @@ void ufs_error (struct super_block * sb, const char * function, if (!(sb->s_flags & MS_RDONLY)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); sb->s_flags |= MS_RDONLY; } va_start (args, fmt); @@ -334,7 +334,7 @@ void ufs_panic (struct super_block * sb, const char * function, if (!(sb->s_flags & MS_RDONLY)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); - sb->s_dirt = 1; + ufs_mark_sb_dirty(sb); } va_start (args, fmt); vsnprintf (error_buf, sizeof(error_buf), fmt, args); @@ -691,6 +691,83 @@ static void ufs_put_super_internal(struct super_block *sb) UFSD("EXIT\n"); } +static int ufs_sync_fs(struct super_block *sb, int wait) +{ + struct ufs_sb_private_info * uspi; + struct ufs_super_block_first * usb1; + struct ufs_super_block_third * usb3; + unsigned flags; + + lock_ufs(sb); + lock_super(sb); + + UFSD("ENTER\n"); + + flags = UFS_SB(sb)->s_flags; + uspi = UFS_SB(sb)->s_uspi; + usb1 = ubh_get_usb_first(uspi); + usb3 = ubh_get_usb_third(uspi); + + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ufs_put_cstotal(sb); + + UFSD("EXIT\n"); + unlock_super(sb); + unlock_ufs(sb); + + return 0; +} + +static void delayed_sync_fs(struct work_struct *work) +{ + struct ufs_sb_info *sbi; + + sbi = container_of(work, struct ufs_sb_info, sync_work.work); + + spin_lock(&sbi->work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->work_lock); + + ufs_sync_fs(sbi->sb, 1); +} + +void ufs_mark_sb_dirty(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + unsigned long delay; + + spin_lock(&sbi->work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->sync_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->work_lock); +} + +static void ufs_put_super(struct super_block *sb) +{ + struct ufs_sb_info * sbi = UFS_SB(sb); + + UFSD("ENTER\n"); + + if (!(sb->s_flags & MS_RDONLY)) + ufs_put_super_internal(sb); + cancel_delayed_work_sync(&sbi->sync_work); + + ubh_brelse_uspi (sbi->s_uspi); + kfree (sbi->s_uspi); + kfree (sbi); + sb->s_fs_info = NULL; + UFSD("EXIT\n"); + return; +} + static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; @@ -716,6 +793,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) goto failed_nomem; sb->s_fs_info = sbi; + sbi->sb = sb; UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); @@ -727,6 +805,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) } #endif mutex_init(&sbi->mutex); + spin_lock_init(&sbi->work_lock); + INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* * Set default mount options * Parse mount options @@ -1191,68 +1271,6 @@ failed_nomem: return -ENOMEM; } -static int ufs_sync_fs(struct super_block *sb, int wait) -{ - struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; - struct ufs_super_block_third * usb3; - unsigned flags; - - lock_ufs(sb); - lock_super(sb); - - UFSD("ENTER\n"); - - flags = UFS_SB(sb)->s_flags; - uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); - usb3 = ubh_get_usb_third(uspi); - - usb1->fs_time = cpu_to_fs32(sb, get_seconds()); - if ((flags & UFS_ST_MASK) == UFS_ST_SUN || - (flags & UFS_ST_MASK) == UFS_ST_SUNOS || - (flags & UFS_ST_MASK) == UFS_ST_SUNx86) - ufs_set_fs_state(sb, usb1, usb3, - UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); - ufs_put_cstotal(sb); - sb->s_dirt = 0; - - UFSD("EXIT\n"); - unlock_super(sb); - unlock_ufs(sb); - - return 0; -} - -static void ufs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - ufs_sync_fs(sb, 1); - else - sb->s_dirt = 0; -} - -static void ufs_put_super(struct super_block *sb) -{ - struct ufs_sb_info * sbi = UFS_SB(sb); - - UFSD("ENTER\n"); - - if (sb->s_dirt) - ufs_write_super(sb); - - if (!(sb->s_flags & MS_RDONLY)) - ufs_put_super_internal(sb); - - ubh_brelse_uspi (sbi->s_uspi); - kfree (sbi->s_uspi); - kfree (sbi); - sb->s_fs_info = NULL; - UFSD("EXIT\n"); - return; -} - - static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) { struct ufs_sb_private_info * uspi; @@ -1308,7 +1326,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ubh_mark_buffer_dirty (USPI_UBH(uspi)); - sb->s_dirt = 0; sb->s_flags |= MS_RDONLY; } else { /* @@ -1458,7 +1475,6 @@ static const struct super_operations ufs_super_ops = { .write_inode = ufs_write_inode, .evict_inode = ufs_evict_inode, .put_super = ufs_put_super, - .write_super = ufs_write_super, .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .remount_fs = ufs_remount, diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 528750b7e70..343e6fc571e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -20,6 +20,10 @@ struct ufs_sb_info { unsigned s_mount_opt; struct mutex mutex; struct task_struct *mutex_owner; + struct super_block *sb; + int work_queued; /* non-zero if the delayed work is queued */ + struct delayed_work sync_work; /* FS sync delayed work */ + spinlock_t work_lock; /* protects sync_work and work_queued */ }; struct ufs_inode_info { @@ -123,6 +127,7 @@ extern __printf(3, 4) void ufs_error(struct super_block *, const char *, const char *, ...); extern __printf(3, 4) void ufs_panic(struct super_block *, const char *, const char *, ...); +void ufs_mark_sb_dirty(struct super_block *sb); /* symlink.c */ extern const struct inode_operations ufs_fast_symlink_inode_operations; diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 8aba544f9fa..0cbd5d340b6 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -34,6 +34,7 @@ #include <linux/kernel.h> #include <linux/stat.h> #include <linux/fs.h> +#include <linux/workqueue.h> #include <asm/div64.h> typedef __u64 __bitwise __fs64; diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 9d1aeb7e273..4f33c32affe 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1074,13 +1074,13 @@ restart: * If we couldn't get anything, give up. */ if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + if (!forced++) { trace_xfs_alloc_near_busy(args); xfs_log_force(args->mp, XFS_LOG_SYNC); goto restart; } - - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; @@ -2434,13 +2434,22 @@ xfs_alloc_vextent_worker( current_restore_flags_nested(&pflags, PF_FSTRANS); } - -int /* error */ +/* + * Data allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Metadata + * requests, OTOH, are generally from low stack usage paths, so avoid the + * context switch overhead here. + */ +int xfs_alloc_vextent( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args) { DECLARE_COMPLETION_ONSTACK(done); + if (!args->userdata) + return __xfs_alloc_vextent(args); + + args->done = &done; INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); queue_work(xfs_alloc_wq, &args->work); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a4beb421018..269b35c084d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -989,27 +989,6 @@ xfs_buf_ioerror_alert( (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); } -int -xfs_bwrite( - struct xfs_buf *bp) -{ - int error; - - ASSERT(xfs_buf_islocked(bp)); - - bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); - - xfs_bdstrat_cb(bp); - - error = xfs_buf_iowait(bp); - if (error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); - } - return error; -} - /* * Called when we want to stop a buffer from getting written or read. * We attach the EIO error, muck with its flags, and call xfs_buf_ioend @@ -1079,14 +1058,7 @@ xfs_bioerror_relse( return EIO; } - -/* - * All xfs metadata buffers except log state machine buffers - * get this attached as their b_bdstrat callback function. - * This is so that we can catch a buffer - * after prematurely unpinning it to forcibly shutdown the filesystem. - */ -int +STATIC int xfs_bdstrat_cb( struct xfs_buf *bp) { @@ -1107,6 +1079,27 @@ xfs_bdstrat_cb( return 0; } +int +xfs_bwrite( + struct xfs_buf *bp) +{ + int error; + + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + + xfs_bdstrat_cb(bp); + + error = xfs_buf_iowait(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + return error; +} + /* * Wrapper around bdstrat so that we can stop data from going to disk in case * we are shutting down the filesystem. Typically user data goes thru this @@ -1243,7 +1236,7 @@ xfs_buf_iorequest( */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 0); + _xfs_buf_ioend(bp, 1); xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7f1d1392ce3..79344c48008 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -180,7 +180,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); extern int xfs_bwrite(struct xfs_buf *bp); extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); -extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 45df2b857d4..d9e451115f9 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -954,7 +954,7 @@ xfs_buf_iodone_callbacks( if (!XFS_BUF_ISSTALE(bp)) { bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; - xfs_bdstrat_cb(bp); + xfs_buf_iorequest(bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3a05a41b5d7..1f1535d25a9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -208,6 +208,7 @@ xfs_open_by_handle( struct inode *inode; struct dentry *dentry; fmode_t fmode; + struct path path; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); @@ -252,8 +253,10 @@ xfs_open_by_handle( goto out_dput; } - filp = dentry_open(dentry, mntget(parfilp->f_path.mnt), - hreq->oflags, cred); + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); if (IS_ERR(filp)) { put_unused_fd(fd); return PTR_ERR(filp); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1a25fd80279..9c4340f5c3e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -179,7 +179,7 @@ xfs_vn_create( struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool flags) { return xfs_vn_mknod(dir, dentry, mode, 0); } @@ -197,7 +197,7 @@ STATIC struct dentry * xfs_vn_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *cip; struct xfs_name name; @@ -222,7 +222,7 @@ STATIC struct dentry * xfs_vn_ci_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *ip; struct xfs_name xname; |