diff options
Diffstat (limited to 'fs')
143 files changed, 4508 insertions, 3009 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 332b5ff02fe..f7003cfac63 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -76,7 +76,7 @@ static const match_table_t tokens = { * Return 0 upon success, -ERRNO upon failure. */ -static int v9fs_parse_options(struct v9fs_session_info *v9ses) +static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) { char *options; substring_t args[MAX_OPT_ARGS]; @@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses) v9ses->debug = 0; v9ses->cache = 0; - if (!v9ses->options) + if (!opts) return 0; - options = kstrdup(v9ses->options, GFP_KERNEL); + options = kstrdup(opts, GFP_KERNEL); if (!options) { P9_DPRINTK(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); @@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; - if (data) { - v9ses->options = kstrdup(data, GFP_KERNEL); - if (!v9ses->options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - retval = -ENOMEM; - goto error; - } - } - rc = v9fs_parse_options(v9ses); + rc = v9fs_parse_options(v9ses, data); if (rc < 0) { retval = rc; goto error; } - v9ses->clnt = p9_client_create(dev_name, v9ses->options); - + v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; @@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) __putname(v9ses->uname); __putname(v9ses->aname); - kfree(v9ses->options); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a7d56719299..38762bf102a 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -85,7 +85,6 @@ struct v9fs_session_info { unsigned int afid; unsigned int cache; - char *options; /* copy of mount options */ char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 81f8bbf12f9..06a223d50a8 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended) /** * v9fs_blank_wstat - helper function to setup a 9P stat structure - * @v9ses: 9P session info (for determining extended mode) * @wstat: structure to initialize * */ @@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat) struct inode *v9fs_get_inode(struct super_block *sb, int mode) { + int err; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); inode = new_inode(sb); - if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_blocks = 0; - inode->i_rdev = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->a_ops = &v9fs_addr_operations; - - switch (mode & S_IFMT) { - case S_IFIFO: - case S_IFBLK: - case S_IFCHR: - case S_IFSOCK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); - return ERR_PTR(-EINVAL); - } - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - break; - case S_IFREG: - inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; - break; - case S_IFLNK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "extended modes used w/o 9P2000.u\n"); - return ERR_PTR(-EINVAL); - } - inode->i_op = &v9fs_symlink_inode_operations; - break; - case S_IFDIR: - inc_nlink(inode); - if (v9fs_extended(v9ses)) - inode->i_op = &v9fs_dir_inode_operations_ext; - else - inode->i_op = &v9fs_dir_inode_operations; - inode->i_fop = &v9fs_dir_operations; - break; - default: - P9_DPRINTK(P9_DEBUG_ERROR, - "BAD mode 0x%x S_IFMT 0x%x\n", - mode, mode & S_IFMT); - return ERR_PTR(-EINVAL); - } - } else { + if (!inode) { P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } + + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_blocks = 0; + inode->i_rdev = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &v9fs_addr_operations; + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + if (!v9fs_extended(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, + "special files without extended mode\n"); + err = -EINVAL; + goto error; + } + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + case S_IFREG: + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + break; + case S_IFLNK: + if (!v9fs_extended(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, + "extended modes used w/o 9P2000.u\n"); + err = -EINVAL; + goto error; + } + inode->i_op = &v9fs_symlink_inode_operations; + break; + case S_IFDIR: + inc_nlink(inode); + if (v9fs_extended(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_ext; + else + inode->i_op = &v9fs_dir_inode_operations; + inode->i_fop = &v9fs_dir_operations; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + mode, mode & S_IFMT); + err = -EINVAL; + goto error; + } + return inode; + +error: + iput(inode); + return ERR_PTR(err); } /* @@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, ret = NULL; st = p9_client_stat(fid); - if (IS_ERR(st)) { - err = PTR_ERR(st); - st = NULL; - goto error; - } + if (IS_ERR(st)) + return ERR_CAST(st); umode = p9mode2unixmode(v9ses, st->mode); ret = v9fs_get_inode(sb, umode); if (IS_ERR(ret)) { err = PTR_ERR(ret); - ret = NULL; goto error; } v9fs_stat2inode(st, ret, sb); ret->i_ino = v9fs_qid2ino(&st->qid); + p9stat_free(st); kfree(st); return ret; error: + p9stat_free(st); kfree(st); - if (ret) - iput(ret); - return ERR_PTR(err); } @@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file) * @v9ses: session information * @dir: directory that dentry is being created in * @dentry: dentry that is being created + * @extension: 9p2000.u extension string to support devices, etc. * @perm: create permissions * @mode: open mode - * @extension: 9p2000.u extension string to support devices, etc. * */ static struct p9_fid * @@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); - v9fs_fid_add(dentry, fid); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + return ofid; error: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 38d695d66a0..8961f1a8f66 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data) static void v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, - int flags) + int flags, void *data) { sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); @@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | MS_NOATIME; + + save_mount_options(sb, data); } /** @@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct v9fs_session_info *v9ses = NULL; struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; - uid_t uid = current_fsuid(); - gid_t gid = current_fsgid(); struct p9_fid *fid; int retval = 0; P9_DPRINTK(P9_DEBUG_VFS, " \n"); - st = NULL; v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return -ENOMEM; @@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto free_stat; } - v9fs_fill_super(sb, v9ses, flags); + v9fs_fill_super(sb, v9ses, flags, data); inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { @@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto release_sb; } - inode->i_uid = uid; - inode->i_gid = gid; - root = d_alloc_root(inode); if (!root) { iput(inode); @@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; -release_sb: - deactivate_locked_super(sb); - free_stat: + p9stat_free(st); kfree(st); clunk_fid: @@ -185,7 +179,12 @@ clunk_fid: close_session: v9fs_session_close(v9ses); kfree(v9ses); + return retval; +release_sb: + p9stat_free(st); + kfree(st); + deactivate_locked_super(sb); return retval; } @@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); + s->s_fs_info = NULL; P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); } -/** - * v9fs_show_options - Show mount options in /proc/mounts - * @m: seq_file to write to - * @mnt: mount descriptor - * - */ - -static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) -{ - struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info; - - seq_printf(m, "%s", v9ses->options); - return 0; -} - static void v9fs_umount_begin(struct super_block *sb) { @@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb) static const struct super_operations v9fs_super_ops = { .statfs = simple_statfs, .clear_inode = v9fs_clear_inode, - .show_options = v9fs_show_options, + .show_options = generic_show_options, .umount_begin = v9fs_umount_begin, }; diff --git a/fs/Kconfig b/fs/Kconfig index 0e7da7bb5d9..455aa207e67 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -43,6 +43,7 @@ source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" +source "fs/nilfs2/Kconfig" endif # BLOCK @@ -186,7 +187,6 @@ source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -source "fs/nilfs2/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/afs/file.c b/fs/afs/file.c index 0149dab365e..681c2a7b013 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page) inode = page->mapping->host; - ASSERT(file != NULL); - key = file->private_data; - ASSERT(key != NULL); + if (file) { + key = file->private_data; + ASSERT(key != NULL); + } else { + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error_nokey; + } + } _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); @@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page) unlock_page(page); } + if (!file) + key_put(key); _leave(" = 0"); return 0; error: SetPageError(page); unlock_page(page); + if (!file) + key_put(key); +error_nokey: _leave(" = %d", ret); return ret; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index aa39ae83f01..3da18d45348 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) } /* Update the expiry counter if fs is busy */ - if (!may_umount_tree(mnt)) { + if (!may_umount_tree(path.mnt)) { struct autofs_info *ino = autofs4_dentry_ino(top); ino->last_used = jiffies; goto done; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b7c1603cd4b..7c1e65d5487 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, } } - /* - * Now fill out the bss section. First pad the last page up - * to the page boundary, and then perform a mmap to make sure - * that there are zero-mapped pages up to and including the - * last bss page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_close; - } + if (last_bss > elf_bss) { + /* + * Now fill out the bss section. First pad the last page up + * to the page boundary, and then perform a mmap to make sure + * that there are zero-mapped pages up to and including the + * last bss page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out_close; + } - /* What we have mapped so far */ - elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); + /* What we have mapped so far */ + elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); - /* Map the last of the bss segment */ - if (last_bss > elf_bss) { + /* Map the last of the bss segment */ down_write(¤t->mm->mmap_sem); error = do_brk(elf_bss, last_bss - elf_bss); up_write(¤t->mm->mmap_sem); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e83be2e4602..15831d5c736 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; + bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; err = bdi_init(bdi); if (err) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 272b9b2bea8..59cba180fe8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_inode *entry; - struct rb_node **p = &root->inode_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p; + struct rb_node *parent; + +again: + p = &root->inode_tree.rb_node; + parent = NULL; spin_lock(&root->inode_lock); while (*p) { @@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode) entry = rb_entry(parent, struct btrfs_inode, rb_node); if (inode->i_ino < entry->vfs_inode.i_ino) - p = &(*p)->rb_left; + p = &parent->rb_left; else if (inode->i_ino > entry->vfs_inode.i_ino) - p = &(*p)->rb_right; + p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & (I_WILL_FREE | I_FREEING | I_CLEAR))); - break; + rb_erase(parent, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + goto again; } } rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); @@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - spin_lock(&root->inode_lock); rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); } + spin_unlock(&root->inode_lock); } static noinline void init_btrfs_i(struct inode *inode) diff --git a/fs/buffer.c b/fs/buffer.c index a3ef091a45b..90a98865b0c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -281,7 +281,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_pdflush(1024); + wakeup_flusher_threads(1024); yield(); for_each_online_node(nid) { @@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; - if (!TestSetPageDirty(page)) - __set_page_dirty(page, page_mapping(page), 0); + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + __set_page_dirty(page, mapping, 0); + } } } diff --git a/fs/char_dev.c b/fs/char_dev.c index a173551e19d..3cbc57f932d 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -31,6 +31,7 @@ * - no readahead or I/O queue unplugging required */ struct backing_dev_info directly_mappable_cdev_bdi = { + .name = "char", .capabilities = ( #ifdef CONFIG_MMU /* permit private copies of the data to be taken */ @@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, } /** - * register_chrdev() - Register a major number for character devices. + * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * @@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. - * - * This function registers a range of 256 minor numbers. The first minor number - * is 0. */ -int register_chrdev(unsigned int major, const char *name, - const struct file_operations *fops) +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; char *s; int err = -ENOMEM; - cd = __register_chrdev_region(major, 0, 256, name); + cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); @@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name, for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) *s = '!'; - err = cdev_add(cdev, MKDEV(cd->major, 0), 256); + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; @@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name, out: kobject_put(&cdev->kobj); out2: - kfree(__unregister_chrdev_region(cd->major, 0, 256)); + kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } @@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count) } } -void unregister_chrdev(unsigned int major, const char *name) +/** + * __unregister_chrdev - unregister and destroy a cdev + * @major: major device number + * @baseminor: first of the range of minor numbers + * @count: the number of minor numbers this cdev is occupying + * @name: name of this range of devices + * + * Unregister and destroy the cdev occupying the region described by + * @major, @baseminor and @count. This function undoes what + * __register_chrdev() did. + */ +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name) { struct char_device_struct *cd; - cd = __unregister_chrdev_region(major, 0, 256); + + cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); @@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_index); -EXPORT_SYMBOL(register_chrdev); -EXPORT_SYMBOL(unregister_chrdev); +EXPORT_SYMBOL(__register_chrdev); +EXPORT_SYMBOL(__unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index e85b1e4389e..145540a316a 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -3,7 +3,10 @@ Version 1.60 Fix memory leak in reconnect. Fix oops in DFS mount error path. Set s_maxbytes to smaller (the max that vfs can handle) so that sendfile will now work over cifs mounts again. Add noforcegid -and noforceuid mount parameters. +and noforceuid mount parameters. Fix small mem leak when using +ntlmv2. Fix 2nd mount to same server but with different port to +be allowed (rather than reusing the 1st port) - only when the +user explicitly overrides the port on the 2nd mount. Version 1.59 ------------ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 051caecf7d6..8ec7736ce95 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) if (server->addr.sockAddr.sin_family == AF_INET) sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); else if (server->addr.sockAddr.sin_family == AF_INET6) - sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); + sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); else goto out; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6941c22398a..7dfe0842a6f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return get_cifs_acl_by_path(cifs_sb, path, pacllen); pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return pntsd; } @@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return rc; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 7c9809523f4..7efe1745494 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -373,6 +373,7 @@ calc_exit_2: compare with the NTLM example */ hmac_md5_final(ses->server->ntlmv2_hash, pctxt); + kfree(pctxt); return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 84b75253b05..3610e9958b4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) static int cifs_show_options(struct seq_file *s, struct vfsmount *m) { - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *tcon; - - cifs_sb = CIFS_SB(m->mnt_sb); - tcon = cifs_sb->tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; - seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); + seq_printf(s, ",unc=%s", tcon->treeName); if (tcon->ses->userName) seq_printf(s, ",username=%s", tcon->ses->userName); if (tcon->ses->domainName) @@ -989,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg) if (try_to_freeze()) continue; - spin_lock(&GlobalMid_Lock); - if (list_empty(&GlobalOplock_Q)) { - spin_unlock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); + if (list_empty(&cifs_oplock_list)) { + spin_unlock(&cifs_oplock_lock); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(39*HZ); } else { - oplock_item = list_entry(GlobalOplock_Q.next, + oplock_item = list_entry(cifs_oplock_list.next, struct oplock_q_entry, qhead); cFYI(1, ("found oplock item to write out")); pTcon = oplock_item->tcon; inode = oplock_item->pinode; netfid = oplock_item->netfid; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); DeleteOplockQEntry(oplock_item); /* can not grab inode sem here since it would deadlock when oplock received on delete @@ -1058,7 +1055,7 @@ init_cifs(void) int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); - INIT_LIST_HEAD(&GlobalOplock_Q); + INIT_LIST_HEAD(&cifs_oplock_list); #ifdef CONFIG_CIFS_EXPERIMENTAL INIT_LIST_HEAD(&GlobalDnotifyReqList); INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); @@ -1087,6 +1084,7 @@ init_cifs(void) rwlock_init(&GlobalSMBSeslock); rwlock_init(&cifs_tcp_ses_lock); spin_lock_init(&GlobalMid_Lock); + spin_lock_init(&cifs_oplock_lock); if (cifs_max_pending < 2) { cifs_max_pending = 2; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 6c170948300..094325e3f71 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.60" +#define CIFS_VERSION "1.61" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6084d6379c0..6cfc81a3270 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -351,11 +351,24 @@ struct cifsFileInfo { bool closePend:1; /* file is marked to close */ bool invalidHandle:1; /* file closed via session abend */ bool messageMode:1; /* for pipes: message vs byte mode */ - atomic_t wrtPending; /* handle in use - defer close */ + atomic_t count; /* reference count */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; }; +/* Take a reference on the file private data */ +static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) +{ + atomic_inc(&cifs_file->count); +} + +/* Release a reference on the file private data */ +static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + if (atomic_dec_and_test(&cifs_file->count)) + kfree(cifs_file); +} + /* * One of these for each file inode */ @@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; */ GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; -GLOBAL_EXTERN struct list_head GlobalOplock_Q; +/* Global list of oplocks */ +GLOBAL_EXTERN struct list_head cifs_oplock_list; + +/* Protects the cifs_oplock_list */ +GLOBAL_EXTERN spinlock_t cifs_oplock_lock; /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1866bc2927d..301e307e127 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) to this tcon */ } -/* Allocate and return pointer to an SMB request buffer, and set basic - SMB information in the SMB header. If the return code is zero, this - function must have filled in request_buf pointer */ +/* reconnect the socket, tcon, and smb session if needed */ static int -small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, - void **request_buf) +cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) { int rc = 0; + struct cifsSesInfo *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for + * tcp and smb session status done differently for those three - in the + * calling routine + */ + if (!tcon) + return 0; + + ses = tcon->ses; + server = ses->server; + + /* + * only tree disconnect, open, and write, (and ulogoff which does not + * have tcon) are allowed as we start force umount + */ + if (tcon->tidStatus == CifsExiting) { + if (smb_command != SMB_COM_WRITE_ANDX && + smb_command != SMB_COM_OPEN_ANDX && + smb_command != SMB_COM_TREE_DISCONNECT) { + cFYI(1, ("can not send cmd %d while umounting", + smb_command)); + return -ENODEV; } - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes back on-line */ - } else /* TCP session is reestablished now */ - break; - } + } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* we do not know sb */, - NULL /* no vol info */); - } + if (ses->status == CifsExiting) + return -EIO; - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus == CifsGood), 10 * HZ); - } else { - return -EIO; + /* is TCP session is reestablished now ?*/ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry || ses->status == CifsExiting) { + cFYI(1, ("gave up waiting on reconnect in smb_init")); + return -EHOSTDOWN; } } + + if (!ses->need_reconnect && !tcon->need_reconnect) + return 0; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + down(&ses->sesSem); + if (ses->need_reconnect) + rc = cifs_setup_session(0, ses, nls_codepage); + + /* do we need to reconnect tcon? */ + if (rc || !tcon->need_reconnect) { + up(&ses->sesSem); + goto out; + } + + mark_open_files_invalid(tcon); + rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + up(&ses->sesSem); + cFYI(1, ("reconnect tcon rc = %d", rc)); + + if (rc) + goto out; + + /* + * FIXME: check if wsize needs updated due to negotiated smb buffer + * size shrinking + */ + atomic_inc(&tconInfoReconnectCount); + + /* tell server Unix caps we support */ + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, NULL); + + /* + * Removed call to reopen open files here. It is safer (and faster) to + * reopen files one at a time as needed in read and write. + * + * FIXME: what about file locks? don't we need to reclaim them ASAP? + */ + +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle + */ + switch (smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: + rc = -EAGAIN; + } + + unload_nls(nls_codepage); + return rc; +} + +/* Allocate and return pointer to an SMB request buffer, and set basic + SMB information in the SMB header. If the return code is zero, this + function must have filled in request_buf pointer */ +static int +small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf) +{ + int rc = 0; + + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, { int rc = 0; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } - } - - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes on-line */ - } else /* TCP session is reestablished now */ - break; - } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* do not know sb */, - NULL /* no vol info */); - } - - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); - - } else { - return -EIO; - } - } + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, if (is_unicode) { __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } cifsConvertToUCS((__le16 *) tmp, searchName, PATH_MAX, nls_codepage, remap); node->path_consumed = cifs_ucs2_bytes(tmp, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1f3345d7fa7..d49682433c2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname, } static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr_storage *addr) +cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) { struct list_head *tmp; struct TCP_Server_Info *server; @@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr) if (server->tcpStatus == CifsNew) continue; - if (addr->ss_family == AF_INET && - (addr4->sin_addr.s_addr != - server->addr.sockAddr.sin_addr.s_addr)) - continue; - else if (addr->ss_family == AF_INET6 && - (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, - &addr6->sin6_addr) || - server->addr.sockAddr6.sin6_scope_id != - addr6->sin6_scope_id)) - continue; + switch (addr->ss_family) { + case AF_INET: + if (addr4->sin_addr.s_addr == + server->addr.sockAddr.sin_addr.s_addr) { + addr4->sin_port = htons(port); + /* user overrode default port? */ + if (addr4->sin_port) { + if (addr4->sin_port != + server->addr.sockAddr.sin_port) + continue; + } + break; + } else + continue; + + case AF_INET6: + if (ipv6_addr_equal(&addr6->sin6_addr, + &server->addr.sockAddr6.sin6_addr) && + (addr6->sin6_scope_id == + server->addr.sockAddr6.sin6_scope_id)) { + addr6->sin6_port = htons(port); + /* user overrode default port? */ + if (addr6->sin6_port) { + if (addr6->sin6_port != + server->addr.sockAddr6.sin6_port) + continue; + } + break; + } else + continue; + } ++server->srv_count; write_unlock(&cifs_tcp_ses_lock); @@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(&addr); + tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); if (tcp_ses) return tcp_ses; @@ -2636,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, return -EIO; smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { + if (smb_buffer == NULL) return -ENOMEM; - } + smb_buffer_response = smb_buffer; header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4326ffd90fa..a6424cfc012 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, mutex_init(&pCifsFile->fh_mutex); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); + atomic_set(&pCifsFile->count, 1); /* set the following in open now pCifsFile->pfile = file; */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c34b7f8a217..fa7beac8b80 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private( private_data->pInode = inode; private_data->invalidHandle = false; private_data->closePend = false; - /* we have to track num writers to the inode, since writepages - does not tell us which handle the write is for so there can - be a close (overlapping with write) of the filehandle that - cifs_writepages chose to use */ - atomic_set(&private_data->wrtPending, 0); + /* Initialize reference count to one. The private data is + freed on the release of the last reference */ + atomic_set(&private_data->count, 1); return private_data; } @@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file) if (!pTcon->need_reconnect) { write_unlock(&GlobalSMBSeslock); timeout = 2; - while ((atomic_read(&pSMBFile->wrtPending) != 0) + while ((atomic_read(&pSMBFile->count) != 1) && (timeout <= 2048)) { /* Give write a better chance to get to server ahead of the close. We do not @@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file) msleep(timeout); timeout *= 4; } - if (atomic_read(&pSMBFile->wrtPending)) - cERROR(1, ("close with pending write")); if (!pTcon->need_reconnect && !pSMBFile->invalidHandle) rc = CIFSSMBClose(xid, pTcon, @@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file) list_del(&pSMBFile->flist); list_del(&pSMBFile->tlist); write_unlock(&GlobalSMBSeslock); - timeout = 10; - /* We waited above to give the SMBWrite a chance to issue - on the wire (so we do not get SMBWrite returning EBADF - if writepages is racing with close. Note that writepages - does not specify a file handle, so it is possible for a file - to be opened twice, and the application close the "wrong" - file handle - in these cases we delay long enough to allow - the SMBWrite to get on the wire before the SMB Close. - We allow total wait here over 45 seconds, more than - oplock break time, and more than enough to allow any write - to complete on the server, or to time out on the client */ - while ((atomic_read(&pSMBFile->wrtPending) != 0) - && (timeout <= 50000)) { - cERROR(1, ("writes pending, delay free of handle")); - msleep(timeout); - timeout *= 8; - } - kfree(file->private_data); + cifsFileInfo_put(file->private_data); file->private_data = NULL; } else rc = -EBADF; @@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); read_unlock(&GlobalSMBSeslock); return open_file; } /* else might as well continue, and look for @@ -1276,7 +1255,7 @@ refind_writable: if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || (open_file->pfile->f_flags & O_WRONLY))) { - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); if (!open_file->invalidHandle) { /* found a good writable file */ @@ -1293,7 +1272,7 @@ refind_writable: else { /* start over in case this was deleted */ /* since the list could be modified */ read_lock(&GlobalSMBSeslock); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); goto refind_writable; } } @@ -1309,7 +1288,7 @@ refind_writable: read_lock(&GlobalSMBSeslock); /* can not use this handle, no write pending on this one after all */ - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); if (open_file->closePend) /* list could have changed */ goto refind_writable; @@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (open_file) { bytes_written = cifs_write(open_file->pfile, write_data, to-from, &offset); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); if ((bytes_written > 0) && (offset)) @@ -1562,7 +1541,7 @@ retry: bytes_to_write, offset, &bytes_written, iov, n_iov, long_op); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cifs_update_eof(cifsi, offset, bytes_written); if (rc || bytes_written < bytes_to_write) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 82d83839655..1f09c761931 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -800,7 +800,7 @@ set_via_filehandle: if (open_file == NULL) CIFSSMBClose(xid, pTcon, netfid); else - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); out: return rc; } @@ -1635,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, __u32 npid = open_file->pid; rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, npid, false); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cFYI(1, ("SetFSize for attrs rc = %d", rc)); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { unsigned int bytes_written; @@ -1790,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) u16 nfid = open_file->netfid; u32 npid = open_file->pid; rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); } else { rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, cifs_sb->local_nls, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 0ad3e2d116a..1da4ab250ea 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon) temp->pinode = pinode; temp->tcon = tcon; temp->netfid = fid; - spin_lock(&GlobalMid_Lock); - list_add_tail(&temp->qhead, &GlobalOplock_Q); - spin_unlock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); + list_add_tail(&temp->qhead, &cifs_oplock_list); + spin_unlock(&cifs_oplock_lock); } return temp; - } void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) { - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); /* should we check if list empty first? */ list_del(&oplockEntry->qhead); - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); kmem_cache_free(cifs_oplock_cachep, oplockEntry); } @@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon) if (tcon == NULL) return; - spin_lock(&GlobalMid_Lock); - list_for_each_entry(temp, &GlobalOplock_Q, qhead) { + spin_lock(&cifs_oplock_lock); + list_for_each_entry(temp, &cifs_oplock_list, qhead) { if ((temp->tcon) && (temp->tcon == tcon)) { list_del(&temp->qhead); kmem_cache_free(cifs_oplock_cachep, temp); } } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); } static int diff --git a/fs/compat.c b/fs/compat.c index 94502dab972..6d6f98fe64a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1567,10 +1561,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4921e7426d9..a2f746066c5 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = { }; static struct backing_dev_info configfs_backing_dev_info = { + .name = "configfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6b..a100fa35a48 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include <linux/swap.h> #include <linux/bootmem.h> #include <linux/fs_struct.h> +#include <linux/hardirq.h> #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ccc9d62c462..55ea369f43a 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb) return rv; } - return genlmsg_unicast(skb, listener_nlpid); + return genlmsg_unicast(&init_net, skb, listener_nlpid); } static int user_cmd(struct sk_buff *skb, struct genl_info *info) diff --git a/fs/exec.c b/fs/exec.c index 4a8849e45b2..172ceb6edde 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,8 +678,8 @@ exit: } EXPORT_SYMBOL(open_exec); -int kernel_read(struct file *file, unsigned long offset, - char *addr, unsigned long count) +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) { mm_segment_t old_fs; loff_t pos = offset; @@ -1016,6 +1016,35 @@ out: EXPORT_SYMBOL(flush_old_exec); /* + * Prepare credentials and lock ->cred_guard_mutex. + * install_exec_creds() commits the new creds and drops the lock. + * Or, if exec fails before, free_bprm() should release ->cred and + * and unlock. + */ +int prepare_bprm_creds(struct linux_binprm *bprm) +{ + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + return -ERESTARTNOINTR; + + bprm->cred = prepare_exec_creds(); + if (likely(bprm->cred)) + return 0; + + mutex_unlock(¤t->cred_guard_mutex); + return -ENOMEM; +} + +void free_bprm(struct linux_binprm *bprm) +{ + free_arg_pages(bprm); + if (bprm->cred) { + mutex_unlock(¤t->cred_guard_mutex); + abort_creds(bprm->cred); + } + kfree(bprm); +} + +/* * install the new credentials for this executable */ void install_exec_creds(struct linux_binprm *bprm) @@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - - /* cred_guard_mutex must be held at least to this point to prevent + /* + * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's - * credentials; any time after this it may be unlocked */ - + * credentials; any time after this it may be unlocked. + */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) EXPORT_SYMBOL(search_binary_handler); -void free_bprm(struct linux_binprm *bprm) -{ - free_arg_pages(bprm); - if (bprm->cred) - abort_creds(bprm->cred); - kfree(bprm); -} - /* * sys_execve() executes a new program. */ @@ -1277,20 +1299,15 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1340,7 +1357,6 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1360,10 +1376,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d636e1297ca..a63d44256a7 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -static int +int ext2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); @@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext2_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index ecefe478898..3ff6cbb9ac4 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_permission (struct inode *, int); +extern int ext2_check_acl (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> -#define ext2_permission NULL +#define ext2_check_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2b9e47dc922..a2f3afd1a1c 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index e1dedb0f787..23701f289e9 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (dir_de) { if (old_dir != new_dir) ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; @@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index fb3c1a21b13..522b15498f4 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -29,23 +29,25 @@ config EXT3_FS module will be called ext3. config EXT3_DEFAULTS_TO_ORDERED - bool "Default to 'data=ordered' in ext3 (legacy option)" + bool "Default to 'data=ordered' in ext3" depends on EXT3_FS help - If a filesystem does not explicitly specify a data ordering - mode, and the journal capability allowed it, ext3 used to - historically default to 'data=ordered'. - - That was a rather unfortunate choice, because it leads to all - kinds of latency problems, and the 'data=writeback' mode is more - appropriate these days. - - You should probably always answer 'n' here, and if you really - want to use 'data=ordered' mode, set it in the filesystem itself - with 'tune2fs -o journal_data_ordered'. - - But if you really want to enable the legacy default, you can do - so by answering 'y' to this question. + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. config EXT3_FS_XATTR bool "Ext3 extended attributes" diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e167bae37ef..c9b0df376b5 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext3_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); @@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext3_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext3_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 07d15a3a596..597334626de 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_permission (struct inode *, int); +extern int ext3_check_acl (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_permission NULL +#define ext3_check_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5b49704b231..29925321478 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -137,7 +137,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 6ff7b973023..aad6400c9b7 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 524b349c629..a8d80a7f110 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl #endif } +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + /* * Show an option if * - it's set to a non-default value OR @@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - + seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS)); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb, datacheck: if (is_remount) { if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - != data_opt) { - printk(KERN_ERR - "EXT3-fs: cannot change data " - "mode on remount\n"); - return 0; - } + == data_opt) + break; + printk(KERN_ERR + "EXT3-fs (device %s): Cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.\n", + sb->s_id, + data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS), + data_mode_string(data_opt)); + return 0; } else { sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; sbi->s_mount_opt |= data_opt; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f6d8967149c..0df88b2a69b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext4_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); @@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext4_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext4_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 949789d2bba..9d843d5deac 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_permission(struct inode *, int); +extern int ext4_check_acl(struct inode *, int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_permission NULL +#define ext4_check_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3f1873fef1c..27f3c5354c0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,7 +207,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index de04013d16f..114abe5d2c1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2536,7 +2536,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fiemap = ext4_fiemap, }; @@ -2548,5 +2548,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c54226be529..da86ef58e42 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -19,171 +19,223 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include "internal.h" +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) -/** - * writeback_acquire - attempt to get exclusive writeback access to a device - * @bdi: the device's backing_dev_info structure - * - * It is a waste of resources to have more than one pdflush thread blocked on - * a single request queue. Exclusion at the request_queue level is obtained - * via a flag in the request_queue's backing_dev_info.state. - * - * Non-request_queue-backed address_spaces will share default_backing_dev_info, - * unless they implement their own. Which is somewhat inefficient, as this - * may prevent concurrent writeback against multiple devices. +/* + * We don't actually have pdflush, but this one is exported though /proc... */ -static int writeback_acquire(struct backing_dev_info *bdi) +int nr_pdflush_threads; + +/* + * Work items for the bdi_writeback threads + */ +struct bdi_work { + struct list_head list; + struct list_head wait_list; + struct rcu_head rcu_head; + + unsigned long seen; + atomic_t pending; + + struct super_block *sb; + unsigned long nr_pages; + enum writeback_sync_modes sync_mode; + + unsigned long state; +}; + +enum { + WS_USED_B = 0, + WS_ONSTACK_B, +}; + +#define WS_USED (1 << WS_USED_B) +#define WS_ONSTACK (1 << WS_ONSTACK_B) + +static inline bool bdi_work_on_stack(struct bdi_work *work) { - return !test_and_set_bit(BDI_pdflush, &bdi->state); + return test_bit(WS_ONSTACK_B, &work->state); +} + +static inline void bdi_work_init(struct bdi_work *work, + struct writeback_control *wbc) +{ + INIT_RCU_HEAD(&work->rcu_head); + work->sb = wbc->sb; + work->nr_pages = wbc->nr_to_write; + work->sync_mode = wbc->sync_mode; + work->state = WS_USED; +} + +static inline void bdi_work_init_on_stack(struct bdi_work *work, + struct writeback_control *wbc) +{ + bdi_work_init(work, wbc); + work->state |= WS_ONSTACK; } /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. * - * Determine whether there is writeback in progress against a backing device. + * Determine whether there is writeback waiting to be handled against a + * backing device. */ int writeback_in_progress(struct backing_dev_info *bdi) { - return test_bit(BDI_pdflush, &bdi->state); + return !list_empty(&bdi->work_list); } -/** - * writeback_release - relinquish exclusive writeback access against a device. - * @bdi: the device's backing_dev_info structure - */ -static void writeback_release(struct backing_dev_info *bdi) +static void bdi_work_clear(struct bdi_work *work) { - BUG_ON(!writeback_in_progress(bdi)); - clear_bit(BDI_pdflush, &bdi->state); + clear_bit(WS_USED_B, &work->state); + smp_mb__after_clear_bit(); + wake_up_bit(&work->state, WS_USED_B); } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) +static void bdi_work_free(struct rcu_head *head) { - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; + struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } + if (!bdi_work_on_stack(work)) + kfree(work); + else + bdi_work_clear(work); } -/** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. - * - * Put the inode on the super block's dirty list. - * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. - * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. - * - * This function *must* be atomic for the I_DIRTY_PAGES case - - * set_page_dirty() is called under spinlock in several places. - * - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of - * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of - * the kernel-internal blockdev inode represents the dirtying time of the - * blockdev's pages. This is why for I_DIRTY_PAGES we always use - * page->mapping->host, so the page-dirtying time is recorded in the internal - * blockdev inode. - */ -void __mark_inode_dirty(struct inode *inode, int flags) +static void wb_work_complete(struct bdi_work *work) { - struct super_block *sb = inode->i_sb; + const enum writeback_sync_modes sync_mode = work->sync_mode; /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself + * For allocated work, we can clear the done/seen bit right here. + * For on-stack work, we need to postpone both the clear and free + * to after the RCU grace period, since the stack could be invalidated + * as soon as bdi_work_clear() has done the wakeup. */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + if (!bdi_work_on_stack(work)) + bdi_work_clear(work); + if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work)) + call_rcu(&work->rcu_head, bdi_work_free); +} +static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) +{ /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * The caller has retrieved the work arguments from this work, + * drop our reference. If this is the last ref, delete and free it */ - smp_mb(); + if (atomic_dec_and_test(&work->pending)) { + struct backing_dev_info *bdi = wb->bdi; - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + spin_lock(&bdi->wb_lock); + list_del_rcu(&work->list); + spin_unlock(&bdi->wb_lock); - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); - - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + wb_work_complete(work); + } +} - inode->i_state |= flags; +static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) +{ + if (work) { + work->seen = bdi->wb_mask; + BUG_ON(!work->seen); + atomic_set(&work->pending, bdi->wb_cnt); + BUG_ON(!bdi->wb_cnt); /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * Make sure stores are seen before it appears on the list */ - if (inode->i_state & I_SYNC) - goto out; + smp_mb(); - /* - * Only add valid (hashed) inodes to the superblock's - * dirty list. Add blockdev inodes as well. - */ - if (!S_ISBLK(inode->i_mode)) { - if (hlist_unhashed(&inode->i_hash)) - goto out; - } - if (inode->i_state & (I_FREEING|I_CLEAR)) - goto out; + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&work->list, &bdi->work_list); + spin_unlock(&bdi->wb_lock); + } + + /* + * If the default thread isn't there, make sure we add it. When + * it gets created and wakes up, we'll run this work. + */ + if (unlikely(list_empty_careful(&bdi->wb_list))) + wake_up_process(default_backing_dev_info.wb.task); + else { + struct bdi_writeback *wb = &bdi->wb; /* - * If the inode was already on s_dirty/s_io/s_more_io, don't - * reposition it (that would break s_dirty time-ordering). + * If we failed allocating the bdi work item, wake up the wb + * thread always. As a safety precaution, it'll flush out + * everything */ - if (!was_dirty) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } + if (!wb_has_dirty_io(wb)) { + if (work) + wb_clear_pending(wb, work); + } else if (wb->task) + wake_up_process(wb->task); } -out: - spin_unlock(&inode_lock); } -EXPORT_SYMBOL(__mark_inode_dirty); +/* + * Used for on-stack allocated work items. The caller needs to wait until + * the wb threads have acked the work before it's safe to continue. + */ +static void bdi_wait_on_work_clear(struct bdi_work *work) +{ + wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); +} -static int write_inode(struct inode *inode, int sync) +static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) { - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); - return 0; + struct bdi_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + bdi_work_init(work, wbc); + + return work; +} + +void bdi_start_writeback(struct writeback_control *wbc) +{ + const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; + struct bdi_work work_stack, *work = NULL; + + if (!must_wait) + work = bdi_alloc_work(wbc); + + if (!work) { + work = &work_stack; + bdi_work_init_on_stack(work, wbc); + } + + bdi_queue_work(wbc->bdi, work); + + /* + * If the sync mode is WB_SYNC_ALL, block waiting for the work to + * complete. If not, we only need to wait for the work to be started, + * if we allocated it on-stack. We use the same mechanism, if the + * wait bit is set in the bdi_work struct, then threads will not + * clear pending until after they are done. + * + * Note that work == &work_stack if must_wait is true, so we don't + * need to do call_rcu() here ever, since the completion path will + * have done that for us. + */ + if (must_wait || work == &work_stack) { + bdi_wait_on_work_clear(work); + if (work != &work_stack) + call_rcu(&work->rcu_head, bdi_work_free); + } } /* @@ -191,31 +243,32 @@ static int write_inode(struct inode *inode, int sync) * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is - * already the most-recently-dirtied inode on the s_dirty list. If that is + * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - if (!list_empty(&sb->s_dirty)) { - struct inode *tail_inode; + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); - if (time_before(inode->dirtied_when, - tail_inode->dirtied_when)) + tail = list_entry(wb->b_dirty.next, struct inode, i_list); + if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_list, &sb->s_dirty); + list_move(&inode->i_list, &wb->b_dirty); } /* - * requeue inode for re-scanning after sb->s_io list is exhausted. + * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode) { - list_move(&inode->i_list, &inode->i_sb->s_more_io); + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + list_move(&inode->i_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -262,20 +315,18 @@ static void move_expired_inodes(struct list_head *delaying_queue, /* * Queue all expired dirty inodes for io, eldest first. */ -static void queue_io(struct super_block *sb, - unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - list_splice_init(&sb->s_more_io, sb->s_io.prev); - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); + list_splice_init(&wb->b_more_io, wb->b_io.prev); + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -int sb_has_dirty_inodes(struct super_block *sb) +static int write_inode(struct inode *inode, int sync) { - return !list_empty(&sb->s_dirty) || - !list_empty(&sb->s_io) || - !list_empty(&sb->s_more_io); + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + return inode->i_sb->s_op->write_inode(inode, sync); + return 0; } -EXPORT_SYMBOL(sb_has_dirty_inodes); /* * Wait for writeback on an inode to complete. @@ -322,11 +373,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_state & I_SYNC) { /* * If this inode is locked for writeback and we are not doing - * writeback-for-data-integrity, move it to s_more_io so that + * writeback-for-data-integrity, move it to b_more_io so that * writeback can proceed with the other inodes on s_io. * * We'll have another go at writing back this inode when we - * completed a full scan of s_io. + * completed a full scan of b_io. */ if (!wait) { requeue_io(inode); @@ -371,11 +422,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. Redirty - * the inode; Move it from s_io onto s_more_io/s_dirty. + * the inode; Move it from b_io onto b_more_io/b_dirty. */ /* * akpm: if the caller was the kupdate function we put - * this inode at the head of s_dirty so it gets first + * this inode at the head of b_dirty so it gets first * consideration. Otherwise, move it to the tail, for * the reasons described there. I'm not really sure * how much sense this makes. Presumably I had a good @@ -385,7 +436,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->for_kupdate) { /* * For the kupdate function we move the inode - * to s_more_io so it will get more writeout as + * to b_more_io so it will get more writeout as * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; @@ -434,50 +485,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Write out a superblock's list of dirty inodes. A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If we're a pdflush thread, then implement pdflush collision avoidance - * against the entire list. + * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, - * assume that all inodes are backed by the same queue. - * - * FIXME: this linear search could get expensive with many fileystems. But - * how to fix? We need to go from an address_space to all inodes which share - * a queue with that address_space. (Easy: have a global "dirty superblocks" - * list). - * - * The inodes to be written are parked on sb->s_io. They are moved back onto - * sb->s_dirty as they are selected for writing. This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. + * Returns 0 if the super was successfully pinned (or pinning wasn't needed), + * 1 if we failed. */ -void generic_sync_sb_inodes(struct super_block *sb, +static int pin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + /* + * Caller must already hold the ref for this + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + return 0; + } + + spin_lock(&sb_lock); + sb->s_count++; + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) { + spin_unlock(&sb_lock); + return 0; + } + /* + * umounted, drop rwsem again and fall through to failure + */ + up_read(&sb->s_umount); + } + + sb->s_count--; + spin_unlock(&sb_lock); + return 1; +} + +static void unpin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (wbc->sync_mode == WB_SYNC_ALL) + return; + + up_read(&sb->s_umount); + put_super(sb); +} + +static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { + struct super_block *sb = wbc->sb; + const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ - int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); - if (!wbc->for_kupdate || list_empty(&sb->s_io)) - queue_io(sb, wbc->older_than_this); - while (!list_empty(&sb->s_io)) { - struct inode *inode = list_entry(sb->s_io.prev, + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - struct address_space *mapping = inode->i_mapping; - struct backing_dev_info *bdi = mapping->backing_dev_info; long pages_skipped; - if (!bdi_cap_writeback_dirty(bdi)) { + /* + * super block given and doesn't match, skip this inode + */ + if (sb && sb != inode->i_sb) { + redirty_tail(inode); + continue; + } + + if (!bdi_cap_writeback_dirty(wb->bdi)) { redirty_tail(inode); - if (sb_is_blkdev_sb(sb)) { + if (is_blkdev_sb) { /* * Dirty memory-backed blockdev: the ramdisk * driver does this. Skip just this inode @@ -497,21 +582,14 @@ void generic_sync_sb_inodes(struct super_block *sb, continue; } - if (wbc->nonblocking && bdi_write_congested(bdi)) { + if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { wbc->encountered_congestion = 1; - if (!sb_is_blkdev_sb(sb)) + if (!is_blkdev_sb) break; /* Skip a congested fs */ requeue_io(inode); continue; /* Skip a congested blockdev */ } - if (wbc->bdi && bdi != wbc->bdi) { - if (!sb_is_blkdev_sb(sb)) - break; /* fs has the wrong queue */ - requeue_io(inode); - continue; /* blockdev has wrong queue */ - } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -519,16 +597,16 @@ void generic_sync_sb_inodes(struct super_block *sb, if (inode_dirtied_after(inode, start)) break; - /* Is another pdflush already flushing this queue? */ - if (current_is_pdflush() && !writeback_acquire(bdi)) - break; + if (pin_sb_for_writeback(wbc, inode)) { + requeue_io(inode); + continue; + } BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); - if (current_is_pdflush()) - writeback_release(bdi); + unpin_sb_for_writeback(wbc, inode); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked @@ -544,144 +622,571 @@ void generic_sync_sb_inodes(struct super_block *sb, wbc->more_io = 1; break; } - if (!list_empty(&sb->s_more_io)) + if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } - if (sync) { - struct inode *inode, *old_inode = NULL; + spin_unlock(&inode_lock); + /* Leave any unwritten inodes on b_io */ +} + +void writeback_inodes_wbc(struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = wbc->bdi; + writeback_inodes_wb(&bdi->wb, wbc); +} + +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +static inline bool over_bground_thresh(void) +{ + unsigned long background_thresh, dirty_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + + return (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) >= background_thresh); +} + +/* + * Explicit flushing or periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, long nr_pages, + struct super_block *sb, + enum writeback_sync_modes sync_mode, int for_kupdate) +{ + struct writeback_control wbc = { + .bdi = wb->bdi, + .sb = sb, + .sync_mode = sync_mode, + .older_than_this = NULL, + .for_kupdate = for_kupdate, + .range_cyclic = 1, + }; + unsigned long oldest_jif; + long wrote = 0; + + if (wbc.for_kupdate) { + wbc.older_than_this = &oldest_jif; + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } + + for (;;) { /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. + * Don't flush anything for non-integrity writeback where + * no nr_pages was given */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE) + break; - if (inode->i_state & - (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + /* + * If no specific pages were given and this is just a + * periodic background writeout and we are below the + * background dirty threshold, don't do anything + */ + if (for_kupdate && nr_pages <= 0 && !over_bground_thresh()) + break; + + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes_wb(wb, &wbc); + nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + + /* + * If we ran out of stuff to write, bail unless more_io got set + */ + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { + if (wbc.more_io && !wbc.for_kupdate) continue; - __iget(inode); - spin_unlock(&inode_lock); + break; + } + } + + return wrote; +} + +/* + * Return the next bdi_work struct that hasn't been processed by this + * wb thread yet + */ +static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct bdi_work *work, *ret = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(work, &bdi->work_list, list) { + if (!test_and_clear_bit(wb->nr, &work->seen)) + continue; + + ret = work; + break; + } + + rcu_read_unlock(); + return ret; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + if (nr_pages) + return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1); + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +{ + struct backing_dev_info *bdi = wb->bdi; + struct bdi_work *work; + long nr_pages, wrote = 0; + + while ((work = get_next_work_item(bdi, wb)) != NULL) { + enum writeback_sync_modes sync_mode; + + nr_pages = work->nr_pages; + + /* + * Override sync mode, in case we must wait for completion + */ + if (force_wait) + work->sync_mode = sync_mode = WB_SYNC_ALL; + else + sync_mode = work->sync_mode; + + /* + * If this isn't a data integrity operation, just notify + * that we have seen this work and we are now starting it. + */ + if (sync_mode == WB_SYNC_NONE) + wb_clear_pending(wb, work); + + wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0); + + /* + * This is a data integrity writeback, so only do the + * notification when we have completed the work. + */ + if (sync_mode == WB_SYNC_ALL) + wb_clear_pending(wb, work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * wakes up periodically and does kupdated style flushing. + */ +int bdi_writeback_task(struct bdi_writeback *wb) +{ + unsigned long last_active = jiffies; + unsigned long wait_jiffies = -1UL; + long pages_written; + + while (!kthread_should_stop()) { + pages_written = wb_do_writeback(wb, 0); + + if (pages_written) + last_active = jiffies; + else if (wait_jiffies != -1UL) { + unsigned long max_idle; + /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * Longest period of inactivity that we tolerate. If we + * see dirty data again later, the task will get + * recreated automatically. */ - iput(old_inode); - old_inode = inode; + max_idle = max(5UL * 60 * HZ, wait_jiffies); + if (time_after(jiffies, max_idle + last_active)) + break; + } - filemap_fdatawait(mapping); + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(wait_jiffies); + try_to_freeze(); + } - cond_resched(); + return 0; +} + +/* + * Schedule writeback for all backing devices. Expensive! If this is a data + * integrity operation, writeback will be complete when this returns. If + * we are simply called for WB_SYNC_NONE, then writeback will merely be + * scheduled to run. + */ +static void bdi_writeback_all(struct writeback_control *wbc) +{ + const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; + struct backing_dev_info *bdi; + struct bdi_work *work; + LIST_HEAD(list); + +restart: + spin_lock(&bdi_lock); + + list_for_each_entry(bdi, &bdi_list, bdi_list) { + struct bdi_work *work; - spin_lock(&inode_lock); + if (!bdi_has_dirty_io(bdi)) + continue; + + /* + * If work allocation fails, do the writes inline. We drop + * the lock and restart the list writeout. This should be OK, + * since this happens rarely and because the writeout should + * eventually make more free memory available. + */ + work = bdi_alloc_work(wbc); + if (!work) { + struct writeback_control __wbc; + + /* + * Not a data integrity writeout, just continue + */ + if (!must_wait) + continue; + + spin_unlock(&bdi_lock); + __wbc = *wbc; + __wbc.bdi = bdi; + writeback_inodes_wbc(&__wbc); + goto restart; } - spin_unlock(&inode_lock); - iput(old_inode); - } else - spin_unlock(&inode_lock); + if (must_wait) + list_add_tail(&work->wait_list, &list); + + bdi_queue_work(bdi, work); + } + + spin_unlock(&bdi_lock); - return; /* Leave any unwritten inodes on s_io */ + /* + * If this is for WB_SYNC_ALL, wait for pending work to complete + * before returning. + */ + while (!list_empty(&list)) { + work = list_entry(list.next, struct bdi_work, wait_list); + list_del(&work->wait_list); + bdi_wait_on_work_clear(work); + call_rcu(&work->rcu_head, bdi_work_free); + } } -EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); -static void sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc) +/* + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. + */ +void wakeup_flusher_threads(long nr_pages) { - generic_sync_sb_inodes(sb, wbc); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + }; + + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + wbc.nr_to_write = nr_pages; + bdi_writeback_all(&wbc); } -/* - * Start writeback of dirty pagecache data against all unlocked inodes. +static noinline void block_dump___mark_inode_dirty(struct inode *inode) +{ + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + printk(KERN_DEBUG + "%s(%d): dirtied inode %lu (%s) on %s\n", + current->comm, task_pid_nr(current), inode->i_ino, + name, inode->i_sb->s_id); + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } +} + +/** + * __mark_inode_dirty - internal function + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. * - * Note: - * We don't need to grab a reference to superblock here. If it has non-empty - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all - * empty. Since __sync_single_inode() regains inode_lock before it finally moves - * inode from superblock lists we are OK. + * Put the inode on the super block's dirty list. + * + * CAREFUL! We mark it dirty unconditionally, but move it onto the + * dirty list only if it is hashed or if it refers to a blockdev. + * If it was not hashed, it will never be added to the dirty list + * even if it is later hashed, as it will have been marked dirty already. * - * If `older_than_this' is non-zero then only flush inodes which have a - * flushtime older than *older_than_this. + * In short, make sure you hash any inodes _before_ you start marking + * them dirty. * - * If `bdi' is non-zero then we will scan the first inode against each - * superblock until we find the matching ones. One group will be the dirty - * inodes against a filesystem. Then when we hit the dummy blockdev superblock, - * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not - * super-efficient but we're about to do a ton of I/O... + * This function *must* be atomic for the I_DIRTY_PAGES case - + * set_page_dirty() is called under spinlock in several places. + * + * Note that for blockdevs, inode->dirtied_when represents the dirtying time of + * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of + * the kernel-internal blockdev inode represents the dirtying time of the + * blockdev's pages. This is why for I_DIRTY_PAGES we always use + * page->mapping->host, so the page-dirtying time is recorded in the internal + * blockdev inode. */ -void -writeback_inodes(struct writeback_control *wbc) +void __mark_inode_dirty(struct inode *inode, int flags) { - struct super_block *sb; + struct super_block *sb = inode->i_sb; - might_sleep(); - spin_lock(&sb_lock); -restart: - list_for_each_entry_reverse(sb, &super_blocks, s_list) { - if (sb_has_dirty_inodes(sb)) { - /* we're making our own get_super here */ - sb->s_count++; - spin_unlock(&sb_lock); - /* - * If we can't get the readlock, there's no sense in - * waiting around, most of the time the FS is going to - * be unmounted by the time it is released. - */ - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) - sync_sb_inodes(sb, wbc); - up_read(&sb->s_umount); + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + if (unlikely(block_dump)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (hlist_unhashed(&inode->i_hash)) + goto out; + } + if (inode->i_state & (I_FREEING|I_CLEAR)) + goto out; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + struct backing_dev_info *bdi = wb->bdi; + + if (bdi_cap_writeback_dirty(bdi) && + !test_bit(BDI_registered, &bdi->state)) { + WARN_ON(1); + printk(KERN_ERR "bdi-%s not registered\n", + bdi->name); } - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &wb->b_dirty); } - if (wbc->nr_to_write <= 0) - break; } - spin_unlock(&sb_lock); +out: + spin_unlock(&inode_lock); } +EXPORT_SYMBOL(__mark_inode_dirty); /* - * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. * - * A finite limit is set on the number of pages which will be written. - * To prevent infinite livelock of sys_sync(). + * If we're a pdlfush thread, then implement pdflush collision avoidance + * against the entire list. * - * We add in the number of potentially dirty inodes, because each inode write - * can dirty pagecache in the underlying blockdev. + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. + * + * The inodes to be written are parked on bdi->b_io. They are moved back onto + * bdi->b_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on inode_sync_wait. */ -void sync_inodes_sb(struct super_block *sb, int wait) +static void wait_sb_inodes(struct writeback_control *wbc) +{ + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount)); + + spin_lock(&inode_lock); + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); +} + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. The number of pages submitted is + * returned. + */ +long writeback_inodes_sb(struct super_block *sb) { struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .sb = sb, + .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; - if (!wait) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - - wbc.nr_to_write = nr_dirty + nr_unstable + + nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - } else - wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ - sync_sb_inodes(sb, &wbc); + wbc.nr_to_write = nr_to_write; + bdi_writeback_all(&wbc); + return nr_to_write - wbc.nr_to_write; +} +EXPORT_SYMBOL(writeback_inodes_sb); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. The number of pages synced is returned. + */ +long sync_inodes_sb(struct super_block *sb) +{ + struct writeback_control wbc = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .range_start = 0, + .range_end = LLONG_MAX, + }; + long nr_to_write = LONG_MAX; /* doesn't actually matter */ + + wbc.nr_to_write = nr_to_write; + bdi_writeback_all(&wbc); + wait_sb_inodes(&wbc); + return nr_to_write - wbc.nr_to_write; } +EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f91ccc4a189..4567db6f943 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -801,6 +801,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; /* fuse does it's own writeback accounting */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 941c8425c10..a93b885311d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { + .name = "hugetlbfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; @@ -935,26 +936,28 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, + struct user_struct **user) { int error = -ENOMEM; - int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; - struct user_struct *user = current_user(); + *user = NULL; if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); if (!can_do_hugetlb_shm()) { - if (user_shm_lock(size, user)) { - unlock_shm = 1; + *user = current_user(); + if (user_shm_lock(size, *user)) { WARN_ONCE(1, "Using mlock ulimits for SHM_HUGETLB deprecated\n"); - } else + } else { + *user = NULL; return ERR_PTR(-EPERM); + } } root = hugetlbfs_vfsmount->mnt_root; @@ -996,8 +999,10 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - if (unlock_shm) - user_shm_unlock(size, user); + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } return ERR_PTR(error); } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8fcb6239218..7edb62e9741 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -static int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int rc; @@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jffs2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jffs2_check_acl); -} - int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) { struct posix_acl *acl, *clone; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index fc929f2a14f..f0ba63e3c36 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_permission(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); @@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_permission (NULL) +#define jffs2_check_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 6f60cc910f4..7aa4417e085 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 23c94753986..b7b74e29914 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b7339c3b6ad..4ec11e8bda8 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index d9a721e6db7..5ef7bac265e 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { if (!c->wbuf) return -ENOMEM; +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->wbuf); + return -ENOMEM; + } +#endif return 0; } void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif kfree(c->wbuf); } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a29c7c3e3fb..d66477c3430 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -114,7 +114,7 @@ out: return rc; } -static int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); @@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jfs_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jfs_check_acl); -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7f6063acaa3..2b70fa78e4a 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 88475f10a38..b07bd417ef8 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_permission(struct inode *, int); +int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 514ee2edb92..c79a4270f08 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/libfs.c b/fs/libfs.c index ddfa89948c3..dcec3d3ea64 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, return PTR_ERR(s); s->s_flags = MS_NOUSER; - s->s_maxbytes = ~0ULL; + s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 99d737bd432..7cb076ac6b4 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap) return hash & (NLM_HOST_NRHASH - 1); } -static void nlm_clear_port(struct sockaddr *sap) -{ - switch (sap->sa_family) { - case AF_INET: - ((struct sockaddr_in *)sap)->sin_port = 0; - break; - case AF_INET6: - ((struct sockaddr_in6 *)sap)->sin6_port = 0; - break; - } -} - /* * Common host lookup routine for server & client */ @@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrbuf = nsm->sm_addrbuf; memcpy(nlm_addr(host), ni->sap, ni->salen); host->h_addrlen = ni->salen; - nlm_clear_port(nlm_addr(host)); + rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); host->h_version = ni->version; host->h_proto = ni->protocol; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 7fce1b52584..30c933188dd 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) return (struct sockaddr *)&nsm->sm_addr; } -static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); -} - -static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); - else if (sin6->sin6_scope_id != 0) - snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, - sin6->sin6_scope_id); - else - snprintf(buf, len, "%pI6", &sin6->sin6_addr); -} - -static void nsm_display_address(const struct sockaddr *sap, - char *buf, const size_t len) -{ - switch (sap->sa_family) { - case AF_INET: - nsm_display_ipv4_address(sap, buf, len); - break; - case AF_INET6: - nsm_display_ipv6_address(sap, buf, len); - break; - default: - snprintf(buf, len, "unsupported address family"); - break; - } -} - static struct rpc_clnt *nsm_create(void) { struct sockaddr_in sin = { @@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, memcpy(nsm_addr(new), sap, salen); new->sm_addrlen = salen; nsm_init_private(new); - nsm_display_address((const struct sockaddr *)&new->sm_addr, - new->sm_addrbuf, sizeof(new->sm_addrbuf)); + + if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, + sizeof(new->sm_addrbuf)) == 0) + (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), + "unsupported address family"); memcpy(new->sm_name, hostname, hostname_len); new->sm_name[hostname_len] = '\0'; diff --git a/fs/locks.c b/fs/locks.c index b6440f52178..19ee18a6829 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) - cond_resched_bkl(); + cond_resched(); find_conflict: for_each_lock(inode, before) { @@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (can_sleep) lock->fl_flags |= FL_SLEEP; - error = security_file_lock(filp, cmd); + error = security_file_lock(filp, lock->fl_type); if (error) goto out_free; diff --git a/fs/namei.c b/fs/namei.c index f3c5b278895..d11f404667e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,19 +169,10 @@ void putname(const char *name) EXPORT_SYMBOL(putname); #endif - -/** - * generic_permission - check for access rights on a Posix-like filesystem - * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * @check_acl: optional callback to check for Posix ACLs - * - * Used to check for read/write/execute permissions on a file. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things.. +/* + * This does basic POSIX ACL permission checking */ -int generic_permission(struct inode *inode, int mask, +static int acl_permission_check(struct inode *inode, int mask, int (*check_acl)(struct inode *inode, int mask)) { umode_t mode = inode->i_mode; @@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask, else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { int error = check_acl(inode, mask); - if (error == -EACCES) - goto check_capabilities; - else if (error != -EAGAIN) + if (error != -EAGAIN) return error; } @@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask, */ if ((mask & ~mode) == 0) return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @check_acl: optional callback to check for Posix ACLs + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things.. + */ +int generic_permission(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) +{ + int ret; + + /* + * Do the basic POSIX ACL permission checks. + */ + ret = acl_permission_check(inode, mask, check_acl); + if (ret != -EACCES) + return ret; - check_capabilities: /* * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. @@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask) if (inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else - retval = generic_permission(inode, mask, NULL); + retval = generic_permission(inode, mask, inode->i_op->check_acl); if (retval) return retval; @@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, */ static int exec_permission_lite(struct inode *inode) { - umode_t mode = inode->i_mode; + int ret; - if (inode->i_op->permission) - return -EAGAIN; - - if (current_fsuid() == inode->i_uid) - mode >>= 6; - else if (in_group_p(inode->i_gid)) - mode >>= 3; - - if (mode & MAY_EXEC) - goto ok; - - if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE)) - goto ok; - - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE)) + if (inode->i_op->permission) { + ret = inode->i_op->permission(inode, MAY_EXEC); + if (!ret) + goto ok; + return ret; + } + ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); + if (!ret) goto ok; - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) + if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; - return -EACCES; + return ret; ok: return security_inode_permission(inode, MAY_EXEC); } @@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); - if (err == -EAGAIN) - err = inode_permission(nd->path.dentry->d_inode, - MAY_EXEC); - if (!err) - err = ima_path_check(&nd->path, MAY_EXEC, - IMA_COUNT_UPDATE); if (err) break; @@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag) if (error) return error; - error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + error = ima_path_check(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE), IMA_COUNT_UPDATE); + if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { + error = -EPERM; if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) - return -EPERM; + goto err_out; if (flag & O_TRUNC) - return -EPERM; + goto err_out; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME) - if (!is_owner_or_cap(inode)) - return -EPERM; + if (!is_owner_or_cap(inode)) { + error = -EPERM; + goto err_out; + } /* * Ensure there are no outstanding leases on the file. */ error = break_lease(inode, flag); if (error) - return error; + goto err_out; if (flag & O_TRUNC) { error = get_write_access(inode); if (error) - return error; + goto err_out; /* * Refuse to truncate files with mandatory locks held on them. @@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag) } put_write_access(inode); if (error) - return error; + goto err_out; } else if (flag & FMODE_WRITE) vfs_dq_init(inode); return 0; +err_out: + ima_counts_put(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); + return error; } /* diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 845159814de..da7fda639ea 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ direct.o pagelist.o proc.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o + write.o namespace.o mount_clnt.o \ + dns_resolve.o cache_lib.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 00000000000..b4ffd0146ea --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,140 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register(struct cache_detail *cd) +{ + struct nameidata nd; + struct vfsmount *mnt; + int ret; + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); + if (ret) + goto err; + ret = sunrpc_cache_register_pipefs(nd.path.dentry, + cd->name, 0600, cd); + path_put(&nd.path); + if (!ret) + return ret; +err: + rpc_put_mount(); + return ret; +} + +void nfs_cache_unregister(struct cache_detail *cd) +{ + sunrpc_cache_unregister_pipefs(cd); + rpc_put_mount(); +} + diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 00000000000..76f856e284e --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,27 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ + +#include <linux/completion.h> +#include <linux/sunrpc/cache.h> +#include <asm/atomic.h> + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register(struct cache_detail *cd); +extern void nfs_cache_unregister(struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7f604c7941f..293fa0528a6 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; +#define NFS_CALLBACK_MAXPORTNR (65535U) -static int param_set_port(const char *val, struct kernel_param *kp) +static int param_set_portnr(const char *val, struct kernel_param *kp) { - char *endp; - int num = simple_strtol(val, &endp, 0); - if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) return -EINVAL; - *((int *)kp->arg) = num; + *((unsigned int *)kp->arg) = num; return 0; } -module_param_call(callback_tcpport, param_set_port, param_get_int, - &nfs_callback_set_tcpport, 0644); +static int param_get_portnr(char *buffer, struct kernel_param *kp) +{ + return param_get_uint(buffer, kp); +} +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); /* * This is the NFSv4 callback kernel thread. diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8d25ccb2d51..e350bd6a233 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; if (server->wsize > max_rpc_payload) @@ -1074,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1274,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; server->options = data->options; /* Get a client record */ @@ -1359,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1400,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; /* Get a client representation. * Note: NFSv4 always uses TCP, */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e4e089a8f29..6c3210099d5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -934,9 +934,6 @@ out: * back into its cache. We let the server do generic write * parameter checking and report problems. * - * We also avoid an unnecessary invocation of generic_osync_inode(), - * as it is fairly meaningless to sync the metadata of an NFS file. - * * We eliminate local atime updates, see direct read above. * * We avoid unnecessary page cache invalidations for normal cached diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 00000000000..f4d54ba97cc --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * Resolves DNS hostnames into valid ip addresses + */ + +#include <linux/hash.h> +#include <linux/string.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/seq_file.h> +#include <linux/inet.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/svcauth.h> + +#include "dns_resolve.h" +#include "cache_lib.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = (long)item->h.expiry_time - (long)get_seconds(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, "<none> "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned long ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + ttl = get_expiry(&buf); + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + get_seconds(); + + ret = -ENOMEM; + item = nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static struct cache_detail nfs_dns_resolve = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .hash_table = nfs_dns_table, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_init, + .alloc = nfs_dns_ent_alloc, +}; + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < get_seconds() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + + ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, &nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +int nfs_dns_resolver_init(void) +{ + return nfs_cache_register(&nfs_dns_resolve); +} + +void nfs_dns_resolver_destroy(void) +{ + nfs_cache_unregister(&nfs_dns_resolve); +} + diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 00000000000..a3f0938babf --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,14 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 05062329b67..5021b75d2d1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) } /* + * Decide whether a read/modify/write cycle may be more efficient + * then a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* * This does the "real" work of the write. We must allocate and lock the * page to be sent back to the generic routine, which then copies the * data from user space. @@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret; - pgoff_t index; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; - index = pos >> PAGE_CACHE_SHIFT; + int once_thru = 0; dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); +start: /* * Prevent starvation issues if someone is doing a consistency * sync-to-disk @@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, if (ret) { unlock_page(page); page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; } return ret; } @@ -479,6 +523,7 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, }; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 86147b0ab2c..21a84d45916 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); -static struct rpc_pipe_ops idmap_upcall_ops = { +static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = idmap_pipe_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, @@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; - idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", - idmap, &idmap_upcall_ops, 0); + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, + "idmap", idmap, &idmap_upcall_ops, 0); if (IS_ERR(idmap->idmap_dentry)) { error = PTR_ERR(idmap->idmap_dentry); kfree(idmap); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bd7938eda6a..060022b4651 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -46,6 +46,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; + else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) inode->i_nlink = fattr->nlink; + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -1145,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) loff_t cur_isize, new_isize; unsigned long invalid = 0; unsigned long now = jiffies; + unsigned long save_cache_validity; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1171,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); @@ -1189,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); nfsi->change_attr = fattr->change_attr; } - } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + invalid |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ @@ -1201,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } - } + } else if (server->caps & NFS_CAP_MTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { @@ -1215,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } - } + } else if (server->caps & NFS_CAP_CTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1231,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: isize change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } - } + } else + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_mode = fattr->mode; } - } + } else if (server->caps & NFS_CAP_MODE) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (inode->i_uid != fattr->uid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (inode->i_gid != fattr->gid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -1263,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; inode->i_nlink = fattr->nlink; } - } + } else if (server->caps & NFS_CAP_NLINK) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* @@ -1293,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; if (!nfs_have_delegation(inode, FMODE_READ) || - (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) + (save_cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; - nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED; return 0; out_changed: @@ -1442,6 +1507,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_dns_resolver_init(); + if (err < 0) + goto out8; + err = nfs_fscache_register(); if (err < 0) goto out7; @@ -1500,6 +1569,8 @@ out5: out6: nfs_fscache_unregister(); out7: + nfs_dns_resolver_destroy(); +out8: return err; } @@ -1511,6 +1582,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); nfs_fscache_unregister(); + nfs_dns_resolver_destroy(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7dd90a6769d..e21b1bb9972 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -49,6 +49,11 @@ struct nfs_clone_mount { #define NFS_MAX_SECFLAVORS (12) /* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* * In-kernel mount arguments */ struct nfs_parsed_mount_data { @@ -63,6 +68,7 @@ struct nfs_parsed_mount_data { unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + unsigned int version; unsigned int minorversion; char *fscache_uniq; @@ -71,7 +77,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; u32 version; - unsigned short port; + int port; unsigned short protocol; } mount_server; @@ -80,7 +86,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; - unsigned short port; + int port; unsigned short protocol; } nfs_server; @@ -102,6 +108,7 @@ struct nfs_mount_request { }; extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern struct rpc_program nfs_program; @@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(void *word); /* super.c */ -void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +#else +#define nfs_migrate_page NULL +#endif /* nfs4proc.c */ extern int _nfs4_call_sync(struct nfs_server *server, @@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } - -#define IPV6_SCOPE_DELIMITER '%' - -/* - * Set the port number in an address. Be agnostic about the address - * family. - */ -static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; - - switch (sap->sa_family) { - case AF_INET: - ap->sin_port = htons(port); - break; - case AF_INET6: - ap6->sin6_port = htons(port); - break; - } -} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 38ef9eaec40..0adefc40cc8 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -209,6 +209,71 @@ out_mnt_err: goto out; } +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct mountres result; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_clnt *clnt; + int status; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (unlikely(IS_ERR(clnt))) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + /* * XDR encode/decode functions for MOUNT */ @@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { if (mnt_errtbl[i].status == status) { res->errno = mnt_errtbl[i].errno; return 0; @@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { if (mnt3_errtbl[i].status == status) { res->errno = mnt3_errtbl[i].errno; return 0; @@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = { .p_statidx = MOUNTPROC_MNT, .p_name = "MOUNT", }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, }; static struct rpc_procinfo mnt3_procedures[] = { @@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = { .p_statidx = MOUNTPROC3_MNT, .p_name = "MOUNT", }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, }; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0cc5ce0edf..ee6a13f0544 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) /* * Create a regular file. - * For now, we don't implement O_EXCL. */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2a2a0a7143a..2636c26d56f 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -17,6 +17,7 @@ #include <linux/inet.h> #include "internal.h" #include "nfs4_fs.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + + ret = rpc_pton(string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, char *page, char *page2, const struct nfs4_fs_location *location) @@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - nfs_parse_ip_address(buf->data, buf->len, - mountdata->addr, &mountdata->addrlen); - if (mountdata->addr->sa_family == AF_UNSPEC) + mountdata->addrlen = nfs_parse_server_name(buf->data, + buf->len, + mountdata->addr, mountdata->addrlen); + if (mountdata->addrlen == 0) continue; - nfs_set_port(mountdata->addr, NFS_PORT); + rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6917311f201..be6544aef41 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -61,6 +61,8 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +#define NFS4_MAX_LOOP_ON_RECOVER (10) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); @@ -426,17 +428,19 @@ out: static int nfs4_recover_session(struct nfs4_session *session) { struct nfs_client *clp = session->clp; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) break; nfs4_schedule_state_manager(clp); + ret = -EIO; } - return 0; + return ret; } static int nfs41_setup_sequence(struct nfs4_session *session, @@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) static int nfs4_recover_expired_lease(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; nfs4_schedule_state_recovery(clp); + ret = -EIO; } - return 0; + return ret; } /* @@ -1997,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f status = nfs4_call_sync(server, &msg, &args, &res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME); if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 65ca8c18476..1434080aefe 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } /* Initialize or reset the session */ - if (nfs4_has_session(clp) && - test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) + && nfs4_has_session(clp)) { if (clp->cl_cons_state == NFS_CS_SESSION_INITING) status = nfs4_initialize_session(clp); else diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 617273e7d47..cfc30d362f9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -702,29 +702,12 @@ struct compound_hdr { u32 minorversion; }; -/* - * START OF "GENERIC" ENCODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((uint32_t)((n) >> 32)); \ - *p++ = htonl((uint32_t)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ -} while (0) - -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - BUG_ON(!p); \ -} while (0) +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { @@ -749,12 +732,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); - WRITE32(hdr->taglen); - WRITEMEM(hdr->tag, hdr->taglen); - WRITE32(hdr->minorversion); + p = reserve_space(xdr, 4 + hdr->taglen + 8); + p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); + *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; - WRITE32(hdr->nops); + *p = cpu_to_be32(hdr->nops); } static void encode_nops(struct compound_hdr *hdr) @@ -829,55 +811,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 16; else if (iap->ia_valid & ATTR_MTIME) len += 4; - RESERVE_SPACE(len); + p = reserve_space(xdr, len); /* * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - WRITE32(2); + *p++ = cpu_to_be32(2); q = p; p += 3; if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; - WRITE64(iap->ia_size); + p = xdr_encode_hyper(p, iap->ia_size); } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; - WRITE32(owner_namelen); - WRITEMEM(owner_name, owner_namelen); + p = xdr_encode_opaque(p, owner_name, owner_namelen); } if (iap->ia_valid & ATTR_GID) { bmval1 |= FATTR4_WORD1_OWNER_GROUP; - WRITE32(owner_grouplen); - WRITEMEM(owner_group, owner_grouplen); + p = xdr_encode_opaque(p, owner_group, owner_grouplen); } if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } /* @@ -891,7 +871,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len = (char *)p - (char *)q - 12; *q++ = htonl(bmval0); *q++ = htonl(bmval1); - *q++ = htonl(len); + *q = htonl(len); /* out: */ } @@ -900,9 +880,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_ACCESS); - WRITE32(access); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_ACCESS); + *p = cpu_to_be32(access); hdr->nops++; hdr->replen += decode_access_maxsz; } @@ -911,10 +891,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg { __be32 *p; - RESERVE_SPACE(8+NFS4_STATEID_SIZE); - WRITE32(OP_CLOSE); - WRITE32(arg->seqid->sequence->counter); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); + xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; } @@ -923,10 +903,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_COMMIT); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_COMMIT); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_commit_maxsz; } @@ -935,30 +915,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_CREATE); - WRITE32(create->ftype); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_CREATE); + *p = cpu_to_be32(create->ftype); switch (create->ftype) { case NF4LNK: - RESERVE_SPACE(4); - WRITE32(create->u.symlink.len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: - RESERVE_SPACE(8); - WRITE32(create->u.device.specdata1); - WRITE32(create->u.device.specdata2); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); break; default: break; } - RESERVE_SPACE(4 + create->name->len); - WRITE32(create->name->len); - WRITEMEM(create->name->name, create->name->len); + encode_string(xdr, create->name->len, create->name->name); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -969,10 +947,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_GETATTR); - WRITE32(1); - WRITE32(bitmap); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -981,11 +959,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_GETATTR); - WRITE32(2); - WRITE32(bm0); - WRITE32(bm1); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -1012,8 +990,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_GETFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETFH); hdr->nops++; hdr->replen += decode_getfh_maxsz; } @@ -1022,10 +1000,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_LINK); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_LINK); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; } @@ -1052,27 +1029,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args { __be32 *p; - RESERVE_SPACE(32); - WRITE32(OP_LOCK); - WRITE32(nfs4_lock_type(args->fl, args->block)); - WRITE32(args->reclaim); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE32(args->new_lock_owner); + p = reserve_space(xdr, 32); + *p++ = cpu_to_be32(OP_LOCK); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); - WRITE32(args->open_seqid->sequence->counter); - WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); } else { - RESERVE_SPACE(NFS4_STATEID_SIZE+4); - WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; hdr->replen += decode_lock_maxsz; @@ -1082,15 +1059,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar { __be32 *p; - RESERVE_SPACE(52); - WRITE32(OP_LOCKT); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 52); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1099,13 +1076,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar { __be32 *p; - RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); - WRITE32(OP_LOCKU); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE32(args->seqid->sequence->counter); - WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); + p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; hdr->replen += decode_locku_maxsz; } @@ -1115,10 +1092,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc int len = name->len; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_LOOKUP); - WRITE32(len); - WRITEMEM(name->name, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_LOOKUP); + xdr_encode_opaque(p, name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; } @@ -1127,21 +1103,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) { __be32 *p; - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); switch (fmode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - WRITE32(NFS4_SHARE_ACCESS_READ); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); break; case FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_WRITE); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); break; case FMODE_READ|FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_BOTH); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); break; default: - WRITE32(0); + *p++ = cpu_to_be32(0); } - WRITE32(0); /* for linux, share_deny = 0 always */ + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1151,29 +1127,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, * owner 4 = 32 */ - RESERVE_SPACE(8); - WRITE32(OP_OPEN); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_OPEN); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - RESERVE_SPACE(28); - WRITE64(arg->clientid); - WRITE32(16); - WRITEMEM("open id:", 8); - WRITE64(arg->id); + p = reserve_space(xdr, 28); + p = xdr_encode_hyper(p, arg->clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + xdr_encode_hyper(p, arg->id); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { case 0: - WRITE32(NFS4_CREATE_UNCHECKED); + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - WRITE32(NFS4_CREATE_EXCLUSIVE); + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); encode_nfs4_verifier(xdr, &arg->u.verifier); } } @@ -1182,14 +1158,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (arg->open_flags & O_CREAT) { case 0: - WRITE32(NFS4_OPEN_NOCREATE); + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - WRITE32(NFS4_OPEN_CREATE); + *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } } @@ -1198,16 +1174,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (delegation_type) { case 0: - WRITE32(NFS4_OPEN_DELEGATE_NONE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); break; case FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_READ); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); break; case FMODE_WRITE|FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_WRITE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); break; default: BUG(); @@ -1218,8 +1194,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_NULL); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } @@ -1227,8 +1203,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1236,9 +1212,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1267,10 +1243,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; } @@ -1279,10 +1255,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_DOWNGRADE); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; hdr->replen += decode_open_downgrade_maxsz; @@ -1294,10 +1270,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd int len = fh->size; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_PUTFH); - WRITE32(len); - WRITEMEM(fh->data, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_PUTFH); + xdr_encode_opaque(p, fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; } @@ -1306,8 +1281,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_PUTROOTFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_PUTROOTFH); hdr->nops++; hdr->replen += decode_putrootfh_maxsz; } @@ -1317,26 +1292,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context nfs4_stateid stateid; __be32 *p; - RESERVE_SPACE(NFS4_STATEID_SIZE); + p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - WRITEMEM(stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READ); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); encode_stateid(xdr, args->context); - RESERVE_SPACE(12); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_read_maxsz; } @@ -1349,20 +1324,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; __be32 *p; - RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); - WRITE32(OP_READDIR); - WRITE64(readdir->cookie); - WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); - WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ - WRITE32(readdir->count); - WRITE32(2); + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); + *p++ = cpu_to_be32(OP_READDIR); + p = xdr_encode_hyper(p, readdir->cookie); + p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(2); /* Switch to mounted_on_fileid if the server supports it */ if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) attrs[0] &= ~FATTR4_WORD0_FILEID; else attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - WRITE32(attrs[0] & readdir->bitmask[0]); - WRITE32(attrs[1] & readdir->bitmask[1]); + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; hdr->replen += decode_readdir_maxsz; dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", @@ -1378,8 +1353,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READLINK); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READLINK); hdr->nops++; hdr->replen += decode_readlink_maxsz; } @@ -1388,10 +1363,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_REMOVE); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_REMOVE); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; } @@ -1400,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co { __be32 *p; - RESERVE_SPACE(8 + oldname->len); - WRITE32(OP_RENAME); - WRITE32(oldname->len); - WRITEMEM(oldname->name, oldname->len); - - RESERVE_SPACE(4 + newname->len); - WRITE32(newname->len); - WRITEMEM(newname->name, newname->len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RENAME); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); hdr->nops++; hdr->replen += decode_rename_maxsz; } @@ -1416,9 +1386,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_RENEW); - WRITE64(client_stateid->cl_clientid); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_RENEW); + xdr_encode_hyper(p, client_stateid->cl_clientid); hdr->nops++; hdr->replen += decode_renew_maxsz; } @@ -1428,8 +1398,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_RESTOREFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RESTOREFH); hdr->nops++; hdr->replen += decode_restorefh_maxsz; } @@ -1439,16 +1409,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); - RESERVE_SPACE(2*4); - WRITE32(1); - WRITE32(FATTR4_WORD0_ACL); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); if (arg->acl_len % 4) return -EINVAL; - RESERVE_SPACE(4); - WRITE32(arg->acl_len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; @@ -1460,8 +1430,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_SAVEFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_SAVEFH); hdr->nops++; hdr->replen += decode_savefh_maxsz; } @@ -1470,9 +1440,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; encode_attrs(xdr, arg->iap, server); @@ -1482,17 +1452,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie { __be32 *p; - RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID); - WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID); + xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_prog); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_cb_ident); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); hdr->nops++; hdr->replen += decode_setclientid_maxsz; } @@ -1501,10 +1471,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ { __be32 *p; - RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID_CONFIRM); - WRITE64(client_state->cl_clientid); - WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); + p = xdr_encode_hyper(p, client_state->cl_clientid); + xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; } @@ -1513,15 +1483,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_WRITE); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); encode_stateid(xdr, args->context); - RESERVE_SPACE(16); - WRITE64(args->offset); - WRITE32(args->stable); - WRITE32(args->count); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); hdr->nops++; @@ -1532,10 +1502,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - WRITE32(OP_DELEGRETURN); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_DELEGRETURN); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; } @@ -1548,16 +1518,16 @@ static void encode_exchange_id(struct xdr_stream *xdr, { __be32 *p; - RESERVE_SPACE(4 + sizeof(args->verifier->data)); - WRITE32(OP_EXCHANGE_ID); - WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); + p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); + *p++ = cpu_to_be32(OP_EXCHANGE_ID); + xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); - RESERVE_SPACE(12); - WRITE32(args->flags); - WRITE32(0); /* zero length state_protect4_a */ - WRITE32(0); /* zero length implementation id array */ + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(args->flags); + *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + *p = cpu_to_be32(0); /* zero length implementation id array */ hdr->nops++; hdr->replen += decode_exchange_id_maxsz; } @@ -1571,55 +1541,43 @@ static void encode_create_session(struct xdr_stream *xdr, uint32_t len; struct nfs_client *clp = args->client; - RESERVE_SPACE(4); - WRITE32(OP_CREATE_SESSION); - - RESERVE_SPACE(8); - WRITE64(clp->cl_ex_clid); + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); - RESERVE_SPACE(8); - WRITE32(clp->cl_seqid); /*Sequence id */ - WRITE32(args->flags); /*flags */ + p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); + *p++ = cpu_to_be32(OP_CREATE_SESSION); + p = xdr_encode_hyper(p, clp->cl_ex_clid); + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ - RESERVE_SPACE(2*28); /* 2 channel_attrs */ /* Fore Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->fc_attrs.max_ops); /* max operations */ - WRITE32(args->fc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->bc_attrs.max_ops); /* max operations */ - WRITE32(args->bc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ - - RESERVE_SPACE(4); - WRITE32(args->cb_program); /* cb_program */ - - RESERVE_SPACE(4); /* # of security flavors */ - WRITE32(1); - - RESERVE_SPACE(4); - WRITE32(RPC_AUTH_UNIX); /* auth_sys */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - RESERVE_SPACE(4); - WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ - len = scnprintf(machine_name, sizeof(machine_name), "%s", - clp->cl_ipaddr); - RESERVE_SPACE(16 + len); - WRITE32(len); - WRITEMEM(machine_name, len); - WRITE32(0); /* UID */ - WRITE32(0); /* GID */ - WRITE32(0); /* No more gids */ + *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + p = xdr_encode_opaque(p, machine_name, len); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ hdr->nops++; hdr->replen += decode_create_session_maxsz; } @@ -1629,9 +1587,9 @@ static void encode_destroy_session(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); - WRITE32(OP_DESTROY_SESSION); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(OP_DESTROY_SESSION); + xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } @@ -1655,8 +1613,8 @@ static void encode_sequence(struct xdr_stream *xdr, WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); slot = tp->slots + args->sa_slotid; - RESERVE_SPACE(4); - WRITE32(OP_SEQUENCE); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); + *p++ = cpu_to_be32(OP_SEQUENCE); /* * Sessionid + seqid + slotid + max slotid + cache_this @@ -1670,12 +1628,11 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[3], slot->seq_nr, args->sa_slotid, tp->highest_used_slotid, args->sa_cache_this); - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(slot->seq_nr); - WRITE32(args->sa_slotid); - WRITE32(tp->highest_used_slotid); - WRITE32(args->sa_cache_this); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); hdr->nops++; hdr->replen += decode_sequence_maxsz; #endif /* CONFIG_NFS_V4_1 */ @@ -2466,68 +2423,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, } #endif /* CONFIG_NFS_V4_1 */ -/* - * START OF "GENERIC" DECODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (unlikely(!p)) { \ - dprintk("nfs: %s: prematurely hit end of receive" \ - " buffer\n", __func__); \ - dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ - __func__, xdr->p, nbytes, xdr->end); \ - return -EIO; \ - } \ -} while (0) +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { __be32 *p; - READ_BUF(4); - READ32(*len); - READ_BUF(*len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; *string = (char *)p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - READ_BUF(8); - READ32(hdr->status); - READ32(hdr->taglen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); - READ_BUF(hdr->taglen + 4); + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); - READ32(hdr->nops); + hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) @@ -2536,18 +2478,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != expected) { dprintk("nfs: Server returned operation" " %d but we issued a request for %d\n", opnum, expected); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr != NFS_OK) return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* Dummy routine */ @@ -2557,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) unsigned int strlen; char *str; - READ_BUF(12); - return decode_opaque_inline(xdr, &strlen, &str); + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -2566,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) uint32_t bmlen; __be32 *p; - READ_BUF(4); - READ32(bmlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); bitmap[0] = bitmap[1] = 0; - READ_BUF((bmlen << 2)); + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; if (bmlen > 0) { - READ32(bitmap[0]); + bitmap[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bitmap[1]); + bitmap[1] = be32_to_cpup(p); } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) { __be32 *p; - READ_BUF(4); - READ32(*attrlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = be32_to_cpup(p); *savep = xdr->p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -2609,8 +2571,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { - READ_BUF(4); - READ32(*type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); return -EIO; @@ -2620,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -2631,14 +2598,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { - READ_BUF(8); - READ64(*change); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -2650,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { - READ_BUF(8); - READ64(*size); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2667,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2683,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -2701,9 +2688,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { - READ_BUF(16); - READ64(fsid->major); - READ64(fsid->minor); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; ret = NFS_ATTR_FATTR_FSID; } @@ -2711,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2721,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2737,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2754,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2772,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2790,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2807,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2824,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -2838,8 +2865,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) __be32 *p; int status = 0; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n == 0) goto root_path; dprintk("path "); @@ -2873,6 +2902,9 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -2890,8 +2922,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n <= 0) goto out_eio; res->nlocations = 0; @@ -2899,8 +2933,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st u32 m; struct nfs4_fs_location *loc = &res->locations[res->nlocations]; - READ_BUF(4); - READ32(m); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); loc->nservers = 0; dprintk("%s: servers ", __func__); @@ -2939,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; +out_overflow: + print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -2953,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -2970,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { - READ_BUF(4); - READ32(*maxlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -2987,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { - READ_BUF(4); - READ32(*maxname); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3005,8 +3058,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { uint64_t maxread; - READ_BUF(8); - READ64(maxread); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; *res = (uint32_t)maxread; @@ -3014,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3026,8 +3084,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { uint64_t maxwrite; - READ_BUF(8); - READ64(maxwrite); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; *res = (uint32_t)maxwrite; @@ -3035,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3047,14 +3110,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { - READ_BUF(4); - READ32(tmp); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3066,16 +3134,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { - READ_BUF(4); - READ32(*nlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *uid, int may_sleep) { uint32_t len; __be32 *p; @@ -3085,10 +3159,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; else @@ -3101,9 +3181,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: uid=%d\n", __func__, (int)*uid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *gid, int may_sleep) { uint32_t len; __be32 *p; @@ -3113,10 +3197,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; else @@ -3129,6 +3219,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: gid=%d\n", __func__, (int)*gid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -3143,9 +3236,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { dev_t tmp; - READ_BUF(8); - READ32(major); - READ32(minor); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); tmp = MKDEV(major, minor); if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; @@ -3154,6 +3249,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3165,12 +3263,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3182,12 +3285,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3199,12 +3307,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -3216,14 +3329,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { - READ_BUF(8); - READ64(*used); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -3232,12 +3350,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) uint64_t sec; uint32_t nsec; - READ_BUF(12); - READ64(sec); - READ32(nsec); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -3315,11 +3438,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c { __be32 *p; - READ_BUF(20); - READ32(cinfo->atomic); - READ64(cinfo->before); - READ64(cinfo->after); + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) @@ -3331,40 +3459,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) status = decode_op_hdr(xdr, OP_ACCESS); if (status) return status; - READ_BUF(8); - READ32(supp); - READ32(acc); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); access->supported = supp; access->access = acc; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ int status; status = decode_op_hdr(xdr, OP_CLOSE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, 8); } static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_COMMIT); - if (status) - return status; - READ_BUF(8); - COPYMEM(res->verf->verifier, 8); - return 0; + if (!status) + status = decode_verifier(xdr, res->verf->verifier); + return status; } static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3378,10 +3528,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; if ((status = decode_change_info(xdr, cinfo))) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) @@ -3466,7 +3622,8 @@ xdr_error: return status; } -static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) { __be32 *savep; uint32_t attrlen, @@ -3538,12 +3695,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); + status = decode_attr_owner(xdr, bitmap, server->nfs_client, + &fattr->uid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); + status = decode_attr_group(xdr, bitmap, server->nfs_client, + &fattr->gid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -3633,14 +3792,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) if (status) return status; - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; - READ_BUF(len); - COPYMEM(fh->data, len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3662,10 +3828,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - READ_BUF(32); - READ64(offset); - READ64(length); - READ32(type); + p = xdr_inline_decode(xdr, 32); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); if (fl != NULL) { fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; @@ -3676,23 +3844,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - READ64(clientid); - READ32(namelen); - READ_BUF(namelen); - return -NFS4ERR_DENIED; + p = xdr_decode_hyper(p, &clientid); + namelen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, namelen); + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCK); if (status == -EIO) goto out; if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; } else if (status == -NFS4ERR_DENIED) status = decode_lock_denied(xdr, NULL); if (res->open_seqid != NULL) @@ -3713,16 +3885,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); - if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - } + if (status == 0) + status = decode_stateid(xdr, &res->stateid); return status; } @@ -3737,34 +3906,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) __be32 *p; uint32_t limit_type, nblocks, blocksize; - READ_BUF(12); - READ32(limit_type); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: - READ64(*maxsize); + xdr_decode_hyper(p, maxsize); break; case 2: - READ32(nblocks); - READ32(blocksize); + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; uint32_t delegation_type; + int status; - READ_BUF(4); - READ32(delegation_type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; return 0; } - READ_BUF(NFS4_STATEID_SIZE+4); - COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); - READ32(res->do_recall); + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); switch (delegation_type) { case NFS4_OPEN_DELEGATE_READ: @@ -3776,6 +3957,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -3787,23 +3971,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) status = decode_op_hdr(xdr, OP_OPEN); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) + if (!status) + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); decode_change_info(xdr, &res->cinfo); - READ_BUF(8); - READ32(res->rflags); - READ32(bmlen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); if (bmlen > 10) goto xdr_error; - READ_BUF(bmlen << 2); + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) - READ32(res->attrset[i]); + res->attrset[i] = be32_to_cpup(p++); for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; @@ -3811,36 +3999,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_putfh(struct xdr_stream *xdr) @@ -3863,9 +4048,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ status = decode_op_hdr(xdr, OP_READ); if (status) return status; - READ_BUF(8); - READ32(eof); - READ32(count); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); hdrlen = (u8 *) p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { @@ -3878,6 +4065,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ res->eof = eof; res->count = count; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -3892,17 +4082,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n int status; status = decode_op_hdr(xdr, OP_READDIR); - if (status) + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) return status; - READ_BUF(8); - COPYMEM(readdir->verifier.data, 8); dprintk("%s: verifier = %08x:%08x\n", __func__, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1]); - hdrlen = (char *) p - (char *) iov->iov_base; + hdrlen = (char *) xdr->p - (char *) iov->iov_base; recvd = rcvbuf->len - hdrlen; if (pglen > recvd) pglen = recvd; @@ -3990,8 +4180,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) return status; /* Convert length of symlink */ - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; @@ -4015,6 +4207,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) kaddr[len+rcvbuf->page_base] = '\0'; kunmap_atomic(kaddr, KM_USER0); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -4112,10 +4307,16 @@ static int decode_setattr(struct xdr_stream *xdr) status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) @@ -4124,35 +4325,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" " %d\n", opnum); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr == NFS_OK) { - READ_BUF(8 + NFS4_VERIFIER_SIZE); - READ64(clp->cl_clientid); - COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &clp->cl_clientid); + memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; /* skip netid string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; /* skip uaddr string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -4169,11 +4385,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) if (status) return status; - READ_BUF(16); - READ32(res->count); - READ32(res->verf->committed); - COPYMEM(res->verf->verifier, 8); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + memcpy(res->verf->verifier, p, 8); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -4187,6 +4408,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, { __be32 *p; uint32_t dummy; + char *dummy_str; int status; struct nfs_client *clp = res->client; @@ -4194,36 +4416,45 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (status) return status; - READ_BUF(8); - READ64(clp->cl_ex_clid); - READ_BUF(12); - READ32(clp->cl_seqid); - READ32(clp->cl_exchange_flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &clp->cl_ex_clid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + clp->cl_exchange_flags = be32_to_cpup(p++); /* We ask for SP4_NONE */ - READ32(dummy); + dummy = be32_to_cpup(p); if (dummy != SP4_NONE) return -EIO; /* Throw away minor_id */ - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; /* Throw away Major id */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away server_scope */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away Implementation id array */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -4232,22 +4463,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr, __be32 *p; u32 nr_attrs; - READ_BUF(28); - READ32(attrs->headerpadsz); - READ32(attrs->max_rqst_sz); - READ32(attrs->max_resp_sz); - READ32(attrs->max_resp_sz_cached); - READ32(attrs->max_ops); - READ32(attrs->max_reqs); - READ32(nr_attrs); + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + attrs->headerpadsz = be32_to_cpup(p++); + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); if (unlikely(nr_attrs > 1)) { printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", __func__, nr_attrs); return -EINVAL; } - if (nr_attrs == 1) - READ_BUF(4); /* skip rdma_attrs */ + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); } static int decode_create_session(struct xdr_stream *xdr, @@ -4259,24 +4503,26 @@ static int decode_create_session(struct xdr_stream *xdr, struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); - - if (status) + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) return status; - /* sessionid */ - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN); - /* seqid, flags */ - READ_BUF(8); - READ32(clp->cl_seqid); - READ32(session->flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); /* Channel attributes */ status = decode_chan_attrs(xdr, &session->fc_attrs); if (!status) status = decode_chan_attrs(xdr, &session->bc_attrs); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -4300,7 +4546,9 @@ static int decode_sequence(struct xdr_stream *xdr, return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); - if (status) + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) goto out_err; /* @@ -4309,36 +4557,43 @@ static int decode_sequence(struct xdr_stream *xdr, */ status = -ESERVERFAULT; - slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; - READ_BUF(NFS4_MAX_SESSIONID_LEN + 20); - COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN); if (memcmp(id.data, res->sr_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + /* seqid */ - READ32(dummy); + slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; + dummy = be32_to_cpup(p++); if (dummy != slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out_err; } /* slot id */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy != res->sr_slotid) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } /* highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* target highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* result flags - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p); status = 0; out_err: res->sr_status = status; return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; #else /* CONFIG_NFS_V4_1 */ return 0; #endif /* CONFIG_NFS_V4_1 */ @@ -4370,7 +4625,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct status = decode_open_downgrade(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4397,7 +4653,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac status = decode_access(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4424,7 +4681,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server + ,!RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4448,7 +4706,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf if ((status = decode_putrootfh(&xdr)) != 0) goto out; if ((status = decode_getfh(&xdr, res->fh)) == 0) - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4473,7 +4732,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem goto out; if ((status = decode_remove(&xdr, &res->cinfo)) != 0) goto out; - decode_getfattr(&xdr, &res->dir_attr, res->server); + decode_getfattr(&xdr, &res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4503,11 +4763,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) goto out; /* Current FH is target directory */ - if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->new_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->old_fattr, res->server); + decode_getfattr(&xdr, res->old_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4540,11 +4802,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4573,11 +4837,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - if (decode_getfattr(&xdr, res->fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->dir_fattr, res->server); + decode_getfattr(&xdr, res->dir_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4609,7 +4875,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g status = decode_putfh(&xdr); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4716,7 +4983,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4748,11 +5016,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr goto out; if (decode_getfh(&xdr, &res->fh) != 0) goto out; - if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if (decode_restorefh(&xdr) != 0) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server); + decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4800,7 +5070,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf status = decode_open(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->f_attr, res->server); + decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4827,7 +5098,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5001,7 +5273,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ status = decode_write(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; out: @@ -5030,7 +5303,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri status = decode_commit(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5194,7 +5468,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf if (status != 0) goto out; status = decode_delegreturn(&xdr); - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5222,7 +5497,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, goto out; xdr_enter_page(&xdr, PAGE_SIZE); status = decode_getfattr(&xdr, &res->fs_locations->fattr, - res->fs_locations->server); + res->fs_locations->server, + !RPC_IS_ASYNC(req->rq_task)); out: return status; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0b4cbdc60ab..867f7050453 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,7 +73,7 @@ enum { Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, - Opt_v2, Opt_v3, + Opt_v2, Opt_v3, Opt_v4, Opt_udp, Opt_tcp, Opt_rdma, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, @@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_nolock, "nolock" }, { Opt_v2, "v2" }, { Opt_v3, "v3" }, + { Opt_v4, "v4" }, { Opt_udp, "udp" }, { Opt_tcp, "tcp" }, { Opt_rdma, "rdma" }, @@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_mountvers, "mountvers=%s" }, { Opt_nfsvers, "nfsvers=%s" }, { Opt_nfsvers, "vers=%s" }, - { Opt_minorversion, "minorversion=%u" }, + { Opt_minorversion, "minorversion=%s" }, { Opt_sec, "sec=%s" }, { Opt_proto, "proto=%s" }, @@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = { }; #ifdef CONFIG_NFS_V4 +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, struct vfsmount *mnt); static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static int nfs4_remote_get_sb(struct file_system_type *fs_type, @@ -742,127 +747,23 @@ static int nfs_verify_server_address(struct sockaddr *addr) } } + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); return 0; } -static void nfs_parse_ipv4_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&sin->sin_addr.s_addr; - - if (str_len <= INET_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", - (int)str_len, string); - - sin->sin_family = AF_INET; - *addr_len = sizeof(*sin); - if (in4_pton(string, str_len, addr, '\0', NULL)) - return; - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) -{ - char *p; - size_t len; - - if ((string + str_len) == delim) - return 1; - - if (*delim != IPV6_SCOPE_DELIMITER) - return 0; - - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return 0; - - len = (string + str_len) - delim - 1; - p = kstrndup(delim + 1, len, GFP_KERNEL); - if (p) { - unsigned long scope_id = 0; - struct net_device *dev; - - dev = dev_get_by_name(&init_net, p); - if (dev != NULL) { - scope_id = dev->ifindex; - dev_put(dev); - } else { - if (strict_strtoul(p, 10, &scope_id) == 0) { - kfree(p); - return 0; - } - } - - kfree(p); - - sin6->sin6_scope_id = scope_id; - dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); - return 1; - } - - return 0; -} - -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; - const char *delim; - - if (str_len <= INET6_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", - (int)str_len, string); - - sin6->sin6_family = AF_INET6; - *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, - IPV6_SCOPE_DELIMITER, &delim) != 0) { - if (nfs_parse_ipv6_scope_id(string, str_len, - delim, sin6) != 0) - return; - } - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#else -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#endif - /* - * Construct a sockaddr based on the contents of a string that contains - * an IP address in presentation format. - * - * If there is a problem constructing the new sockaddr, set the address - * family to AF_UNSPEC. + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. */ -void nfs_parse_ip_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) +static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, + const unsigned short default_port) { - unsigned int i, colons; + unsigned short port = default_port; - colons = 0; - for (i = 0; i < str_len; i++) - if (string[i] == ':') - colons++; + if (parsed_port != NFS_UNSPEC_PORT) + port = parsed_port; - if (colons >= 2) - nfs_parse_ipv6_address(string, str_len, sap, addr_len); - else - nfs_parse_ipv4_address(string, str_len, sap, addr_len); + rpc_set_port(sap, port); } /* @@ -904,8 +805,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) /* * Parse the value of the 'sec=' option. - * - * The flavor_len setting is for v4 mounts. */ static int nfs_parse_security_flavors(char *value, struct nfs_parsed_mount_data *mnt) @@ -916,53 +815,43 @@ static int nfs_parse_security_flavors(char *value, switch (match_token(value, nfs_secflavor_tokens, args)) { case Opt_sec_none: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_NULL; break; case Opt_sec_sys: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_UNIX; break; case Opt_sec_krb5: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; break; default: return 0; } + mnt->auth_flavor_len = 1; return 1; } @@ -1001,7 +890,6 @@ static int nfs_parse_mount_options(char *raw, while ((p = strsep(&raw, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; unsigned long option; - int int_option; int token; if (!*p) @@ -1047,10 +935,18 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_v2: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case Opt_v3: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; break; +#ifdef CONFIG_NFS_V4 + case Opt_v4: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; +#endif case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1264,20 +1160,33 @@ static int nfs_parse_mount_options(char *raw, switch (option) { case NFS2_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case NFS3_VERSION: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; break; +#ifdef CONFIG_NFS_V4 + case NFS4_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; +#endif default: goto out_invalid_value; } break; case Opt_minorversion: - if (match_int(args, &int_option)) - return 0; - if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) - return 0; - mnt->minorversion = int_option; + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = strict_strtoul(string, 10, &option); + kfree(string); + if (rc != 0) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; break; /* @@ -1352,11 +1261,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + mnt->nfs_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; break; case Opt_clientaddr: string = match_strdup(args); @@ -1376,11 +1288,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + mnt->mount_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; break; case Opt_lookupcache: string = match_strdup(args); @@ -1432,8 +1347,11 @@ static int nfs_parse_mount_options(char *raw, return 1; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; out_invalid_value: - printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); @@ -1445,13 +1363,60 @@ out_security_failure: } /* + * Match the requested auth flavors with the list returned by + * the server. Returns zero and sets the mount's authentication + * flavor on success; returns -EACCES if server does not support + * the requested flavor. + */ +static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) +{ + unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + + /* + * Certain releases of Linux's mountd return an empty + * flavor list. To prevent behavioral regression with + * these servers (ie. rejecting mounts that used to + * succeed), revert to pre-2.6.32 behavior (no checking) + * if the returned flavor list is empty. + */ + if (server_authlist_len == 0) + return 0; + + /* + * We avoid sophisticated negotiating here, as there are + * plenty of cases where we can get it wrong, providing + * either too little or too much security. + * + * RFC 2623, section 2.7 suggests we SHOULD prefer the + * flavor listed first. However, some servers list + * AUTH_NULL first. Our caller plants AUTH_SYS, the + * preferred default, in args->auth_flavors[0] if user + * didn't specify sec= mount option. + */ + for (i = 0; i < args->auth_flavor_len; i++) + for (j = 0; j < server_authlist_len; j++) + if (args->auth_flavors[i] == request->auth_flavs[j]) { + dfprintk(MOUNT, "NFS: using auth flavor %d\n", + request->auth_flavs[j]); + args->auth_flavors[0] = request->auth_flavs[j]; + return 0; + } + + dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); + nfs_umount(request); + return -EACCES; +} + +/* * Use the remote server's MOUNT service to request the NFS file handle * corresponding to the provided path. */ static int nfs_try_mount(struct nfs_parsed_mount_data *args, struct nfs_fh *root_fh) { - unsigned int auth_flavor_len = 0; + rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; + unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); struct nfs_mount_request request = { .sap = (struct sockaddr *) &args->mount_server.address, @@ -1459,7 +1424,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, .protocol = args->mount_server.protocol, .fh = root_fh, .noresvport = args->flags & NFS_MOUNT_NORESVPORT, - .auth_flav_len = &auth_flavor_len, + .auth_flav_len = &server_authlist_len, + .auth_flavs = server_authlist, }; int status; @@ -1485,23 +1451,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - - /* - * autobind will be used if mount_server.port == 0 - */ - nfs_set_port(request.sap, args->mount_server.port); + nfs_set_default_port(request.sap, args->mount_server.port, 0); /* * Now ask the mount server to map our export path * to a file handle. */ status = nfs_mount(&request); - if (status == 0) - return 0; + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", - request.hostname, status); - return status; + /* + * MNTv1 (NFSv2) does not support auth flavor negotiation. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + return 0; + return nfs_walk_authlist(args, &request); } static int nfs_parse_simple_hostname(const char *dev_name, @@ -1661,6 +1629,7 @@ static int nfs_validate_mount_data(void *options, const char *dev_name) { struct nfs_mount_data *data = (struct nfs_mount_data *)options; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; if (data == NULL) goto out_no_data; @@ -1672,10 +1641,12 @@ static int nfs_validate_mount_data(void *options, args->acregmax = NFS_DEF_ACREGMAX; args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; - args->mount_server.port = 0; /* autobind unless user sets port */ - args->nfs_server.port = 0; /* autobind unless user sets port */ + args->mount_server.port = NFS_UNSPEC_PORT; + args->nfs_server.port = NFS_UNSPEC_PORT; args->nfs_server.protocol = XPRT_TRANSPORT_TCP; args->auth_flavors[0] = RPC_AUTH_UNIX; + args->auth_flavor_len = 1; + args->minorversion = 0; switch (data->version) { case 1: @@ -1697,8 +1668,11 @@ static int nfs_validate_mount_data(void *options, if (data->root.size > NFS3_FHSIZE || data->root.size == 0) goto out_invalid_fh; mntfh->size = data->root.size; - } else + args->version = 3; + } else { mntfh->size = NFS2_FHSIZE; + args->version = 2; + } memcpy(mntfh->data, data->root.data, mntfh->size); @@ -1720,11 +1694,9 @@ static int nfs_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; - memcpy(&args->nfs_server.address, &data->addr, - sizeof(data->addr)); + memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) @@ -1772,12 +1744,18 @@ static int nfs_validate_mount_data(void *options, if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); + if (args->version == 4) +#ifdef CONFIG_NFS_V4 + return nfs4_validate_text_mount_data(options, + args, dev_name); +#else + goto out_v4_not_compiled; +#endif + + nfs_set_default_port(sap, args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -1825,6 +1803,12 @@ out_v3_not_compiled: return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ +#ifndef CONFIG_NFS_V4 +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V4 */ + out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; @@ -2120,6 +2104,14 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; +#ifdef CONFIG_NFS_V4 + if (data->version == 4) { + error = nfs4_try_mount(flags, dev_name, data, mnt); + kfree(data->client_address); + goto out; + } +#endif /* CONFIG_NFS_V4 */ + /* Get a volume representation */ server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { @@ -2317,6 +2309,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); } +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + + nfs_validate_transport_protocol(args); + + nfs4_validate_mount_flags(args); + + if (args->version != 4) { + dfprintk(MOUNT, + "NFS4: Illegal mount version\n"); + return -EINVAL; + } + + if (args->auth_flavor_len > 1) { + dfprintk(MOUNT, + "NFS4: Too many RPC auth flavours specified\n"); + return -EINVAL; + } + + if (args->client_address == NULL) { + dfprintk(MOUNT, + "NFS4: mount program didn't pass callback address\n"); + return -EINVAL; + } + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); +} + /* * Validate NFSv4 mount options */ @@ -2324,7 +2353,7 @@ static int nfs4_validate_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name) { - struct sockaddr_in *ap; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; @@ -2337,23 +2366,22 @@ static int nfs4_validate_mount_data(void *options, args->acregmax = NFS_DEF_ACREGMAX; args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; - args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ + args->nfs_server.port = NFS_UNSPEC_PORT; args->auth_flavors[0] = RPC_AUTH_UNIX; - args->auth_flavor_len = 0; + args->auth_flavor_len = 1; + args->version = 4; args->minorversion = 0; switch (data->version) { case 1: - ap = (struct sockaddr_in *)&args->nfs_server.address; if (data->host_addrlen > sizeof(args->nfs_server.address)) goto out_no_address; if (data->host_addrlen == 0) goto out_no_address; args->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(ap, data->host_addr, data->host_addrlen)) + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) return -EFAULT; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (data->auth_flavourlen) { @@ -2399,39 +2427,14 @@ static int nfs4_validate_mount_data(void *options, nfs_validate_transport_protocol(args); break; - default: { - int status; - + default: if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) return -EINVAL; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); - - nfs_validate_transport_protocol(args); - - nfs4_validate_mount_flags(args); - - if (args->auth_flavor_len > 1) - goto out_inval_auth; - - if (args->client_address == NULL) - goto out_no_client_address; - - status = nfs_parse_devname(dev_name, - &args->nfs_server.hostname, - NFS4_MAXNAMLEN, - &args->nfs_server.export_path, - NFS4_MAXPATHLEN); - if (status < 0) - return status; - - break; - } + return nfs4_validate_text_mount_data(options, args, dev_name); } return 0; @@ -2448,10 +2451,6 @@ out_inval_auth: out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; - -out_no_client_address: - dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n"); - return -EINVAL; } /* @@ -2618,6 +2617,34 @@ out_err: return ret; } +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, + struct vfsmount *mnt) +{ + char *export_path; + struct vfsmount *root_mnt; + int error; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + error = PTR_ERR(root_mnt); + if (IS_ERR(root_mnt)) + goto out; + + error = nfs_follow_remote_path(root_mnt, export_path, mnt); + +out: + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, + error != 0 ? " [error]" : ""); + return error; +} + /* * Get the superblock for an NFS4 mountpoint */ @@ -2625,8 +2652,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { struct nfs_parsed_mount_data *data; - char *export_path; - struct vfsmount *root_mnt; int error = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -2638,17 +2663,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; - export_path = data->nfs_server.export_path; - data->nfs_server.export_path = "/"; - root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, - data->nfs_server.hostname); - data->nfs_server.export_path = export_path; - - error = PTR_ERR(root_mnt); - if (IS_ERR(root_mnt)) - goto out; - - error = nfs_follow_remote_path(root_mnt, export_path, mnt); + error = nfs4_try_mount(flags, dev_name, data, mnt); out: kfree(data->client_address); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a34fae21fe1..120acadc6a8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/swap.h> +#include <linux/migrate.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> @@ -26,6 +27,7 @@ #include "internal.h" #include "iostat.h" #include "nfs4_fs.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -218,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -/* - * Find an associated nfs write request, and prepare to flush it out - * May return an error if the user signalled nfs_wait_on_request(). - */ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static struct nfs_page *nfs_find_and_lock_request(struct page *page) { struct inode *inode = page->mapping->host; struct nfs_page *req; int ret; spin_lock(&inode->i_lock); - for(;;) { + for (;;) { req = nfs_page_find_request_locked(page); - if (req == NULL) { - spin_unlock(&inode->i_lock); - return 0; - } + if (req == NULL) + break; if (nfs_set_page_tag_locked(req)) break; /* Note: If we hold the page lock, as is the case in nfs_writepage, @@ -247,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret != 0) - return ret; + return ERR_PTR(ret); spin_lock(&inode->i_lock); } - if (test_bit(PG_CLEAN, &req->wb_flags)) { - spin_unlock(&inode->i_lock); - BUG(); - } - if (nfs_set_page_writeback(page) != 0) { - spin_unlock(&inode->i_lock); - BUG(); - } spin_unlock(&inode->i_lock); + return req; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). + */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_find_and_lock_request(page); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = nfs_set_page_writeback(page); + BUG_ON(ret != 0); + BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); - return pgio->pg_error; + ret = pgio->pg_error; } - return 0; +out: + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) @@ -1580,6 +1592,41 @@ int nfs_wb_page(struct inode *inode, struct page* page) return nfs_wb_page_priority(inode, page, FLUSH_STABLE); } +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page) +{ + struct nfs_page *req; + int ret; + + if (PageFsCache(page)) + nfs_fscache_release_page(page, GFP_KERNEL); + + req = nfs_find_and_lock_request(page); + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = migrate_page(mapping, newpage, page); + if (!req) + goto out; + if (ret) + goto out_unlock; + page_cache_get(newpage); + req->wb_page = newpage; + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +out_unlock: + nfs_clear_page_tag_locked(req); + nfs_release_request(req); +out: + return ret; +} +#endif + int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 5573508f707..36fcabbf518 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int flags = nfsexp_flags(rqstp, exp); int ret; + validate_process_creds(); + /* discard any old override before preparing the new set */ revert_creds(get_cred(current->real_cred)); new = prepare_creds(); @@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); + validate_process_creds(); put_cred(override_creds(new)); put_cred(new); + validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b92a27629fb..d9462643155 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, expkey_request); +} + static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); static struct cache_detail svc_expkey_cache; @@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = { .hash_table = expkey_table, .name = "nfsd.fh", .cache_put = expkey_put, - .cache_request = expkey_request, + .cache_upcall = expkey_upcall, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, @@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); +} + static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); @@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = { .hash_table = export_table, .name = "nfsd.export", .cache_put = svc_export_put, - .cache_request = svc_export_request, + .cache_upcall = svc_export_upcall, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b398421b05..cdfa86fa147 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -146,6 +146,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); +} + +static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -175,10 +181,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) } static void -warn_no_idmapd(struct cache_detail *detail) +warn_no_idmapd(struct cache_detail *detail, int has_died) { printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", - detail->last_close? "died" : "not been started"); + has_died ? "died" : "not been started"); } @@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = { .hash_table = idtoname_table, .name = "nfs4.idtoname", .cache_put = ent_put, - .cache_request = idtoname_request, + .cache_upcall = idtoname_upcall, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, @@ -325,6 +331,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); +} + +static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = { .hash_table = nametoid_table, .name = "nfs4.nametoid", .cache_put = ent_put, - .cache_request = nametoid_request, + .cache_upcall = nametoid_upcall, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6d0847562d8..7e906c5b767 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -37,6 +37,7 @@ #include <linux/nfsd/xdr.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> +#include <linux/sunrpc/clnt.h> #include <asm/uaccess.h> #include <net/ipv6.h> @@ -490,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) * * Input: * buf: '\n'-terminated C string containing a - * presentation format IPv4 address + * presentation format IP address * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in */ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - int b1, b2, b3, b4; - char c; + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); char *fo_path; /* sanity check */ @@ -519,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - /* get ipv4 address */ - if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) - return -EINVAL; - if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) + if (rpc_pton(fo_path, size, sap, salen) == 0) return -EINVAL; - sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); + return nlmsvc_unlock_all_by_ip(sap); } /** diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 492c79b7800..24d58adfe5f 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -496,7 +496,9 @@ nfsd(void *vrqstp) /* Lock the export hash tables for reading. */ exp_readlock(); + validate_process_creds(); svc_process(rqstp); + validate_process_creds(); /* Unlock export hash tables */ exp_readunlock(); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23341c1063b..8fa09bfbcba 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + validate_process_creds(); + /* * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE @@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, out_nfserr: err = nfserrno(host_err); out: + validate_process_creds(); return err; } diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 72da095d400..251da07b2a1 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,6 @@ config NILFS2_FS tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + depends on EXPERIMENTAL select CRC32 help NILFS2 is a log-structured file system (LFS) supporting continuous diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 99d58a028b9..08834df6ec6 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); } +/** + * nilfs_bmap_lookup_at_level - find a data block or node block + * @bmap: bmap + * @key: key + * @level: level + * @ptrp: place to store the value associated to @key + * + * Description: nilfs_bmap_lookup_at_level() finds a record whose key + * matches @key in the block at @level of the bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @ptrp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) { @@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, return ret; } -/** - * nilfs_bmap_lookup - find a record - * @bmap: bmap - * @key: key - * @recp: pointer to record - * - * Description: nilfs_bmap_lookup() finds a record whose key matches @key in - * @bmap. - * - * Return Value: On success, 0 is returned and the record associated with @key - * is stored in the place pointed by @recp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. - */ -int nilfs_bmap_lookup(struct nilfs_bmap *bmap, - unsigned long key, - unsigned long *recp) -{ - __u64 ptr; - int ret; - - /* XXX: use macro for level 1 */ - ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); - if (recp != NULL) - *recp = ptr; - return ret; -} - static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; @@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) (entries_per_group / NILFS_BMAP_GROUP_DIV); } -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, - sector_t blocknr) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_start(dat, &req->bpr_req); - if (likely(!ret)) - nilfs_dat_commit_start(dat, &req->bpr_req, blocknr); - return ret; -} - -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); -} - -void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, - sector_t blocknr) -{ - return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); -} - -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) -{ - return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); -} - -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req); - if (ret < 0) - return ret; - ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req); - if (ret < 0) - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - - return ret; -} - -void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_commit_end(dat, &oldreq->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); - nilfs_dat_commit_alloc(dat, &newreq->bpr_req); -} - -void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - nilfs_dat_abort_alloc(dat, &newreq->bpr_req); -} - static struct lock_class_key nilfs_bmap_dat_lock_key; static struct lock_class_key nilfs_bmap_mdt_lock_key; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b2890cdcef1..9980d7dbab9 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -28,6 +28,7 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> #include "alloc.h" +#include "dat.h" #define NILFS_BMAP_INVALID_PTR 0 @@ -141,7 +142,6 @@ struct nilfs_bmap { int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); -int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); @@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, + __u64 *ptr) +{ + return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); +} + /* * Internal use only */ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - return nilfs_bmap_prepare_alloc_v(bmap, req); + if (dat) + return nilfs_dat_prepare_alloc(dat, &req->bpr_req); /* ignore target ptr */ req->bpr_ptr = bmap->b_last_allocated_ptr++; return 0; } static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_alloc_v(bmap, req); + if (dat) + nilfs_dat_commit_alloc(dat, &req->bpr_req); } static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_alloc_v(bmap, req); + if (dat) + nilfs_dat_abort_alloc(dat, &req->bpr_req); else bmap->b_last_allocated_ptr--; } -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); - static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_bmap_prepare_end_v(bmap, req) : 0; + return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; } static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_end_v(bmap, req); + if (dat) + nilfs_dat_commit_end(dat, &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); } static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_end_v(bmap, req); + if (dat) + nilfs_dat_abort_end(dat, &req->bpr_req); } -int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *, - sector_t); -int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); - - __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, const struct buffer_head *); __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); - void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 7e0b61be212..c668bca579c 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, * We cannot call radix_tree_preload for the kernels older * than 2.6.23, because it is not exported for modules. */ +retry: err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (err) goto failed_unlock; @@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, (unsigned long long)oldkey, (unsigned long long)newkey); -retry: spin_lock_irq(&btnc->tree_lock); err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); spin_unlock_irq(&btnc->tree_lock); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index aa412724b64..e25b507a474 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void) kmem_cache_destroy(nilfs_btree_path_cache); } -static inline struct nilfs_btree_path * -nilfs_btree_alloc_path(const struct nilfs_btree *btree) +static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void) { - return (struct nilfs_btree_path *) - kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); } -static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static inline void nilfs_btree_free_path(struct nilfs_btree_path *path) { kmem_cache_free(nilfs_btree_path_cache, path); } -static void nilfs_btree_init_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_init_path(struct nilfs_btree_path *path) { int level; @@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree, } } -static void nilfs_btree_clear_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_release_path(struct nilfs_btree_path *path) { int level; - for (level = NILFS_BTREE_LEVEL_DATA; - level < NILFS_BTREE_LEVEL_MAX; - level++) { - if (path[level].bp_bh != NULL) { - brelse(path[level].bp_bh); - path[level].bp_bh = NULL; - } - /* sib_bh is released or deleted by prepare or commit - * operations. */ - path[level].bp_sib_bh = NULL; - path[level].bp_index = 0; - path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_op = NULL; - } + for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; + level++) + brelse(path[level].bp_bh); } /* @@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, } static inline int -nilfs_btree_node_get_flags(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } static inline void -nilfs_btree_node_set_flags(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int flags) +nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } -static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) { - return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; + return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } static inline int -nilfs_btree_node_get_level(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } static inline void -nilfs_btree_node_set_level(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int level) +nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } static inline int -nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } static inline void -nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int nchildren) +nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } -static inline int -nilfs_btree_node_size(const struct nilfs_btree *btree) +static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) { return 1 << btree->bt_bmap.b_inode->i_blkbits; } static inline int -nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MIN : NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); } static inline int -nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MAX : NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); } static inline __le64 * -nilfs_btree_node_dkeys(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + - (nilfs_btree_node_root(btree, node) ? + (nilfs_btree_node_root(node) ? 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } static inline __le64 * -nilfs_btree_node_dptrs(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + - nilfs_btree_node_nchildren_max(btree, node)); + return (__le64 *)(nilfs_btree_node_dkeys(node) + + nilfs_btree_node_nchildren_max(node, btree)); } static inline __u64 -nilfs_btree_node_get_key(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, int index) +nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + - index)); + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); } static inline void -nilfs_btree_node_set_key(struct nilfs_btree *btree, - struct nilfs_btree_node *node, int index, __u64 key) +nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { - *(nilfs_btree_node_dkeys(btree, node) + index) = - nilfs_bmap_key_to_dkey(key); + *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); } static inline __u64 nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, - int index) + const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + index)); } static inline void nilfs_btree_node_set_ptr(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int index, - __u64 ptr) + struct nilfs_btree_node *node, int index, __u64 ptr) { - *(nilfs_btree_node_dptrs(btree, node) + index) = + *(nilfs_btree_node_dptrs(node, btree) + index) = nilfs_bmap_ptr_to_dptr(ptr); } @@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree, __le64 *dptrs; int i; - nilfs_btree_node_set_flags(btree, node, flags); - nilfs_btree_node_set_level(btree, node, level); - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_flags(node, flags); + nilfs_btree_node_set_level(node, level); + nilfs_btree_node_set_nchildren(node, nchildren); - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nchildren; i++) { dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); @@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); @@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, lnchildren += n; rnchildren -= n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer heads corresponding to left and right are locked. */ @@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); @@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, lnchildren -= n; rnchildren += n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); + nchildren = nilfs_btree_node_get_nchildren(node); if (index < nchildren) { memmove(dkeys + index + 1, dkeys + index, (nchildren - index) * sizeof(*dkeys)); @@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, dkeys[index] = nilfs_bmap_key_to_dkey(key); dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); nchildren++; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); key = nilfs_bmap_dkey_to_key(dkeys[index]); ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (keyp != NULL) *keyp = key; if (ptrp != NULL) @@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, (nchildren - index - 1) * sizeof(*dptrs)); } nchildren--; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } -static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, +static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, __u64 key, int *indexp) { __u64 nkey; @@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, /* binary search */ low = 0; - high = nilfs_btree_node_get_nchildren(btree, node) - 1; + high = nilfs_btree_node_get_nchildren(node) - 1; index = 0; s = 0; while (low <= high) { index = (low + high) / 2; - nkey = nilfs_btree_node_get_key(btree, node, index); + nkey = nilfs_btree_node_get_key(node, index); if (nkey == key) { s = 0; goto out; @@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, } /* adjust index */ - if (nilfs_btree_node_get_level(btree, node) > - NILFS_BTREE_LEVEL_NODE_MIN) { - if ((s > 0) && (index > 0)) + if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { + if (s > 0 && index > 0) index--; } else if (s < 0) index++; @@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree) } static inline struct nilfs_btree_node * -nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_bh->b_data; } static inline struct nilfs_btree_node * -nilfs_btree_get_sib_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; } static inline int nilfs_btree_height(const struct nilfs_btree *btree) { - return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) - + 1; + return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; } static inline struct nilfs_btree_node * @@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree, { return (level == nilfs_btree_height(btree) - 1) ? nilfs_btree_get_root(btree) : - nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_get_nonroot_node(path, level); } static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, @@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, int level, index, found, ret; node = nilfs_btree_get_root(btree); - level = nilfs_btree_node_get_level(btree, node); - if ((level < minlevel) || - (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + level = nilfs_btree_node_get_level(node); + if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) return -ENOENT; - found = nilfs_btree_node_lookup(btree, node, key, &index); + found = nilfs_btree_node_lookup(node, key, &index); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); if (!found) - found = nilfs_btree_node_lookup(btree, node, key, - &index); + found = nilfs_btree_node_lookup(node, key, &index); else index = 0; - if (index < nilfs_btree_node_nchildren_max(btree, node)) + if (index < nilfs_btree_node_nchildren_max(node, btree)) ptr = nilfs_btree_node_get_ptr(btree, node, index); else { WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); @@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, int index, level, ret; node = nilfs_btree_get_root(btree); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = nilfs_btree_node_get_nchildren(node) - 1; if (index < 0) return -ENOENT; - level = nilfs_btree_node_get_level(btree, node); + level = nilfs_btree_node_get_level(node); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); + index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_index = index; } if (keyp != NULL) - *keyp = nilfs_btree_node_get_key(btree, node, index); + *keyp = nilfs_btree_node_get_key(node, index); if (ptrp != NULL) *ptrp = ptr; @@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ptrp != NULL) *ptrp = ptr; - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, int level = NILFS_BTREE_LEVEL_NODE_MIN; int ret, cnt, index, maxlevel; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ret < 0) goto out; @@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, node = nilfs_btree_get_node(btree, path, level); index = path[level].bp_index + 1; for (;;) { - while (index < nilfs_btree_node_get_nchildren(btree, node)) { - if (nilfs_btree_node_get_key(btree, node, index) != + while (index < nilfs_btree_node_get_nchildren(node)) { + if (nilfs_btree_node_get_key(node, index) != key + cnt) goto end; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); @@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, /* look-up right sibling node */ node = nilfs_btree_get_node(btree, path, level + 1); index = path[level + 1].bp_index + 1; - if (index >= nilfs_btree_node_get_nchildren(btree, node) || - nilfs_btree_node_get_key(btree, node, index) != key + cnt) + if (index >= nilfs_btree_node_get_nchildren(node) || + nilfs_btree_node_get_key(node, index) != key + cnt) break; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); path[level + 1].bp_index = index; @@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); if (ret < 0) goto out; - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); index = 0; path[level].bp_index = index; } @@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, *ptrp = ptr; ret = cnt; out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, do { lock_buffer(path[level].bp_bh); nilfs_btree_node_set_key( - btree, - nilfs_btree_get_nonroot_node( - btree, path, level), + nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, /* root */ if (level == nilfs_btree_height(btree) - 1) { - nilfs_btree_node_set_key(btree, - nilfs_btree_get_root(btree), + nilfs_btree_node_set_key(nilfs_btree_get_root(btree), path[level].bp_index, key); } } @@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key( - btree, node, 0)); + nilfs_btree_node_get_key(node, + 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, @@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; @@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); if (move) { brelse(path[level].bp_bh); @@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); move = 0; n = (nchildren + rnchildren + 1) / 2 - rnchildren; @@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; if (move) { brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); path[level + 1].bp_index++; } else { brelse(path[level].bp_sib_bh); @@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); move = 0; n = (nchildren + 1) / 2; @@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); unlock_buffer(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(btree, right, 0); + newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; if (move) { - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); nilfs_btree_node_insert(btree, right, *keyp, *ptrp, path[level].bp_index); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_bh); @@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, } else { nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_sib_bh); @@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, lock_buffer(path[level].bp_sib_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_sib_node(btree, path, level); + child = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, root); + n = nilfs_btree_node_get_nchildren(root); nilfs_btree_node_move_right(btree, root, child, n); - nilfs_btree_node_set_level(btree, root, level + 1); + nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); @@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, child, 0); + *keyp = nilfs_btree_node_get_key(child, 0); *ptrp = path[level].bp_newreq.bpr_ptr; } @@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; int pindex, level, ret; + struct inode *dat = NULL; stats->bs_nblocks = 0; level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { path[level].bp_newreq.bpr_ptr = nilfs_btree_find_target_v(btree, path, key); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_data; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_left; stats->bs_nblocks++; @@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* right sibling */ if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_right; stats->bs_nblocks++; @@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, @@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* root */ node = nilfs_btree_get_root(btree); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, @@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { nilfs_btnode_delete(path[level].bp_sib_bh); nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); } - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_data: *levelp = level; stats->bs_nblocks = 0; @@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, struct nilfs_btree_path *path, int maxlevel, __u64 key, __u64 ptr) { + struct inode *dat = NULL; int level; set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { nilfs_btree_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, - &path[level - 1].bp_newreq); + &path[level - 1].bp_newreq, dat); path[level].bp_op(btree, path, level, &key, &ptr); } @@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); @@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, node, keyp, ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_delete(btree, node, keyp, ptrp, @@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); n = (nchildren + lnchildren) / 2 - nchildren; @@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; @@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); n = (nchildren + rnchildren) / 2 - nchildren; @@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; brelse(path[level].bp_sib_bh); @@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, node); + n = nilfs_btree_node_get_nchildren(node); nilfs_btree_node_move_left(btree, left, node, n); @@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); + path[level].bp_index += nilfs_btree_node_get_nchildren(left); } static void nilfs_btree_concat_right(struct nilfs_btree *btree, @@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, right); + n = nilfs_btree_node_get_nchildren(right); nilfs_btree_node_move_left(btree, node, right, n); @@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_nonroot_node(btree, path, level); + child = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, root, NULL, NULL, 0); - nilfs_btree_node_set_level(btree, root, level); - n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_set_level(root, level); + n = nilfs_btree_node_get_nchildren(child); nilfs_btree_node_move_left(btree, root, child, n); unlock_buffer(path[level].bp_bh); @@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, int *levelp, - struct nilfs_bmap_stats *stats) + struct nilfs_bmap_stats *stats, + struct inode *dat) { struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; @@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; - if (nilfs_btree_node_get_nchildren(btree, node) > - nilfs_btree_node_nchildren_min(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) > + nilfs_btree_node_nchildren_min(node, btree)) { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; @@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_left; stats->bs_nblocks++; @@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* continue; */ } } else if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { /* right sibling */ sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); @@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_right; stats->bs_nblocks++; @@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* no siblings */ /* the only child of the root node */ WARN_ON(level != nilfs_btree_height(btree) - 2); - if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + if (nilfs_btree_node_get_nchildren(node) - 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; @@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; @@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); + nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { brelse(path[level].bp_sib_bh); nilfs_bmap_abort_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); } *levelp = level; stats->bs_nblocks = 0; @@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, static void nilfs_btree_commit_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int maxlevel) + int maxlevel, struct inode *dat) { int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); path[level].bp_op(btree, path, level, NULL, NULL); } @@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; + struct inode *dat; int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); if (ret < 0) goto out; - ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + + dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? + nilfs_bmap_get_dat(&btree->bt_bmap) : NULL; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); if (ret < 0) goto out; - nilfs_btree_commit_delete(btree, path, level); + nilfs_btree_commit_delete(btree, path, level, dat); nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); if (nchildren > 1) return 0; ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); @@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) return 0; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); - maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nchildren = nilfs_btree_node_get_nchildren(node); + maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? - nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + nilfs_btree_node_get_key(node, nchildren - 2) : 0; if (bh != NULL) brelse(bh); @@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); WARN_ON(nchildren > 1); ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); ret = nilfs_btree_get_block(btree, ptr, &bh); @@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, return -EINVAL; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (nchildren < nitems) nitems = nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nitems; i++) { keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); @@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct inode *dat = NULL; int ret; - btree = (struct nilfs_btree *)bmap; stats->bs_nblocks = 0; /* for data */ /* cannot find near ptr */ - if (NILFS_BMAP_USE_VBN(bmap)) + if (NILFS_BMAP_USE_VBN(bmap)) { dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + dat = nilfs_bmap_get_dat(bmap); + } - ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); if (ret < 0) return ret; @@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); if (ret < 0) goto err_out_dreq; @@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* error */ err_out_nreq: - nilfs_bmap_abort_alloc_ptr(bmap, nreq); + nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); err_out_dreq: - nilfs_bmap_abort_alloc_ptr(bmap, dreq); + nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); stats->bs_nblocks = 0; return ret; @@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) { - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_node *node; + struct inode *dat; __u64 tmpptr; /* free resources */ @@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); /* convert and insert */ - btree = (struct nilfs_btree *)bmap; + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; nilfs_btree_init(bmap); if (nreq != NULL) { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); - nilfs_bmap_commit_alloc_ptr(bmap, nreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); /* create child node at level 1 */ lock_buffer(bh); @@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 2, 1, &keys[0], &tmpptr); } else { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); @@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree, static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; int ret; @@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (ret < 0) return ret; @@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, + &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); return ret; } } @@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; - nilfs_bmap_commit_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req, + btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( @@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, @@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int *maxlevelp) + int minlevel, int *maxlevelp, + struct inode *dat) { int level, ret; level = minlevel; if (!buffer_nilfs_volatile(path[level].bp_bh)) { - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) return ret; } @@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, !buffer_dirty(path[level].bp_bh)) { WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) goto out; } @@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, /* error */ out: while (--level > minlevel) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); if (!buffer_nilfs_volatile(path[level].bp_bh)) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); return ret; } static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int maxlevel, - struct buffer_head *bh) + int minlevel, int maxlevel, + struct buffer_head *bh, + struct inode *dat) { int level; if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) - nilfs_btree_commit_update_v(btree, path, minlevel); + nilfs_btree_commit_update_v(btree, path, minlevel, dat); for (level = minlevel + 1; level <= maxlevel; level++) - nilfs_btree_commit_update_v(btree, path, level); + nilfs_btree_commit_update_v(btree, path, level, dat); } static int nilfs_btree_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level, - struct buffer_head *bh) + int level, struct buffer_head *bh) { int maxlevel, ret; struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 ptr; get_bh(bh); path[level].bp_bh = bh; - ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, + dat); if (ret < 0) goto out; @@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, parent = nilfs_btree_get_node(btree, path, level + 1); ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); - ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); if (ret < 0) goto out; } - nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); out: brelse(path[level].bp_bh); @@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, WARN_ON(!buffer_dirty(bh)); btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(bh)) { node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, nilfs_btree_propagate_p(btree, path, level, bh); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, struct buffer_head *bh) { - return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); } static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, @@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, get_bh(bh); node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; - ckey = nilfs_btree_node_get_key(btree, cnode, 0); + ckey = nilfs_btree_node_get_key(cnode, 0); if (key < ckey) break; } @@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree, nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); binfo->bi_dat.bi_level = level; @@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 key; __u64 ptr; union nilfs_bmap_ptr_req req; @@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); - if (unlikely(ret < 0)) + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (ret < 0) return ret; + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); @@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, *bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_btree *btree; struct nilfs_btree_node *node; __u64 key; int ret; - btree = (struct nilfs_btree *)bmap; - ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, + blocknr); if (ret < 0) return ret; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); + key = nilfs_btree_node_get_key(node, 0); } else key = nilfs_bmap_data_get_key(bmap, *bh); @@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); if (ret < 0) { @@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) nilfs_bmap_set_dirty(&btree->bt_bmap); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index aec942cf79e..1c6cfb59128 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) void *kaddr; int ret; - if (cno == 0) - return -ENOENT; /* checkpoint number 0 is invalid */ + /* CP number is invalid if it's zero or larger than the + largest exist one.*/ + if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) + return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); @@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) goto out; kaddr = kmap_atomic(bh->b_page, KM_USER0); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); - ret = nilfs_checkpoint_snapshot(cp); + if (nilfs_checkpoint_invalid(cp)) + ret = -ENOENT; + else + ret = nilfs_checkpoint_snapshot(cp); kunmap_atomic(kaddr, KM_USER0); brelse(bh); diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 788a4595019..debea896e70 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -27,8 +27,6 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> -#define NILFS_CPFILE_GFP NILFS_MDT_GFP - int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 8927ca27e6f..1ff8e15bd36 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) nilfs_palloc_commit_free_entry(dat, req); } -void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); - nilfs_palloc_abort_free_entry(dat, req); -} - int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) { int ret; @@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, nilfs_dat_commit_entry(dat, req); } -void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); -} - int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; @@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) nilfs_dat_abort_entry(dat, req); } +int nilfs_dat_prepare_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + int ret; + + ret = nilfs_dat_prepare_end(dat, oldreq); + if (!ret) { + ret = nilfs_dat_prepare_alloc(dat, newreq); + if (ret < 0) + nilfs_dat_abort_end(dat, oldreq); + } + return ret; +} + +void nilfs_dat_commit_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq, int dead) +{ + nilfs_dat_commit_end(dat, oldreq, dead); + nilfs_dat_commit_alloc(dat, newreq); +} + +void nilfs_dat_abort_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + nilfs_dat_abort_end(dat, oldreq); + nilfs_dat_abort_alloc(dat, newreq); +} + /** * nilfs_dat_mark_dirty - * @dat: DAT file inode diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index d328b81eead..406070d3ff4 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -27,7 +27,6 @@ #include <linux/buffer_head.h> #include <linux/fs.h> -#define NILFS_DAT_GFP NILFS_MDT_GFP struct nilfs_palloc_req; @@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, sector_t); -void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); +void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *, int); +void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 342d9765df8..d369ac71827 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct, direct->d_bmap.b_last_allocated_ptr = ptr; } -static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, - __u64 key, - union nilfs_bmap_ptr_req *req, - struct nilfs_bmap_stats *stats) -{ - int ret; - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - req->bpr_ptr = nilfs_direct_find_target_v(direct, key); - ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req); - if (ret < 0) - return ret; - - stats->bs_nblocks = 1; - return 0; -} - -static void nilfs_direct_commit_insert(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, __u64 ptr) -{ - struct buffer_head *bh; - - /* ptr must be a pointer to a buffer head. */ - bh = (struct buffer_head *)((unsigned long)ptr); - set_buffer_nilfs_volatile(bh); - - nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, req->bpr_ptr); - - if (!nilfs_bmap_dirty(&direct->d_bmap)) - nilfs_bmap_set_dirty(&direct->d_bmap); - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - nilfs_direct_set_target_v(direct, key, req->bpr_ptr); -} - static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat = NULL; + struct buffer_head *bh; int ret; - direct = (struct nilfs_direct *)bmap; if (key > NILFS_DIRECT_KEY_MAX) return -ENOENT; if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) return -EEXIST; - ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_insert(direct, &req, key, ptr); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + if (NILFS_BMAP_USE_VBN(bmap)) { + req.bpr_ptr = nilfs_direct_find_target_v(direct, key); + dat = nilfs_bmap_get_dat(bmap); + } + ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); + if (!ret) { + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); - return 0; -} + nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, req.bpr_ptr); -static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, - struct nilfs_bmap_stats *stats) -{ - int ret; + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); - req->bpr_ptr = nilfs_direct_get_ptr(direct, key); - ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); - if (!ret) - stats->bs_nblocks = 1; - return ret; -} + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_direct_set_target_v(direct, key, req.bpr_ptr); -static void nilfs_direct_commit_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key) -{ - nilfs_bmap_commit_end_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_add_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat; int ret; - direct = (struct nilfs_direct *)bmap; - if ((key > NILFS_DIRECT_KEY_MAX) || + if (key > NILFS_DIRECT_KEY_MAX || nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) return -ENOENT; - ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_delete(direct, &req, key); - nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; + req.bpr_ptr = nilfs_direct_get_ptr(direct, key); - return 0; + ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); + if (!ret) { + nilfs_bmap_commit_end_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_sub_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) @@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, return 0; } -static int nilfs_direct_propagate_v(struct nilfs_direct *direct, - struct buffer_head *bh) +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) { - union nilfs_bmap_ptr_req oldreq, newreq; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; + struct nilfs_palloc_req oldreq, newreq; + struct inode *dat; __u64 key; __u64 ptr; int ret; - key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + if (!NILFS_BMAP_USE_VBN(bmap)) + return 0; + + dat = nilfs_bmap_get_dat(bmap); + key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(direct, key); if (!buffer_nilfs_volatile(bh)) { - oldreq.bpr_ptr = ptr; - newreq.bpr_ptr = ptr; - ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, - &newreq); + oldreq.pr_entry_nr = ptr; + newreq.pr_entry_nr = ptr; + ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); if (ret < 0) return ret; - nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); + nilfs_dat_commit_update(dat, &oldreq, &newreq, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); set_buffer_nilfs_volatile(bh); - nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); } else - ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); return ret; } -static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, - struct buffer_head *bh) -{ - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; - - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_direct_propagate_v(direct, bh) : 0; -} - static int nilfs_direct_assign_v(struct nilfs_direct *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { + struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); union nilfs_bmap_ptr_req req; int ret; req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); - if (unlikely(ret < 0)) - return ret; - - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); - - return 0; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (!ret) { + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + } + return ret; } static int nilfs_direct_assign_p(struct nilfs_direct *direct, diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 5d30a35679b..ecc3ba76db4 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -31,7 +31,6 @@ #include "mdt.h" #include "alloc.h" -#define NILFS_IFILE_GFP NILFS_MDT_GFP static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index fe9d8f2a13f..807e584b163 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); - if (nilfs_read_inode_common(inode, raw_inode)) + err = nilfs_read_inode_common(inode, raw_inode); + if (err) goto failed_unmap; if (S_ISREG(inode->i_mode)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6ea5f872e2d..6572ea4bc4d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, const char *msg; int ret; - ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); - if (ret < 0) { - msg = "cannot read source blocks"; - goto failed; - } - ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); if (ret < 0) { /* @@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, } } - ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + /* + * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), + * which will operates an inode list without blocking. + * To protect the list from concurrent operations, + * nilfs_ioctl_move_blocks should be atomic operation. + */ + if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { + ret = -EBUSY; + goto out_free; + } + + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); + if (ret < 0) + printk(KERN_ERR "NILFS: GC failed during preparation: " + "cannot read source blocks: err=%d\n", ret); + else + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + + clear_nilfs_gc_running(nilfs); out_free: while (--n >= 0) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2dfd47714ae..156bf6091a9 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, goto failed_unlock; err = -EEXIST; - if (buffer_uptodate(bh) || buffer_mapped(bh)) + if (buffer_uptodate(bh)) goto failed_bh; -#if 0 - /* The uptodate flag is not protected by the page lock, but - the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); if (buffer_uptodate(bh)) goto failed_bh; -#endif bh->b_bdev = nilfs->ns_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); @@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, int mode, struct buffer_head **out_bh) { struct buffer_head *bh; - unsigned long blknum = 0; + __u64 blknum = 0; int ret = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); @@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, unlock_buffer(bh); goto out; } - if (!buffer_mapped(bh)) { /* unused buffer */ - ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, - &blknum); - if (unlikely(ret)) { - unlock_buffer(bh); - goto failed_bh; - } - bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; - bh->b_blocknr = blknum; - set_buffer_mapped(bh); + + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = (sector_t)blknum; + set_buffer_mapped(bh); bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) struct inode *inode = container_of(page->mapping, struct inode, i_data); struct super_block *sb = inode->i_sb; + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; struct nilfs_sb_info *writer = NULL; int err = 0; @@ -411,9 +407,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) if (page->mapping->assoc_mapping) return 0; /* Do not request flush for shadow page cache */ if (!sb) { - writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); + down_read(&nilfs->ns_writer_sem); + writer = nilfs->ns_writer; if (!writer) { - nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + up_read(&nilfs->ns_writer_sem); return -EROFS; } sb = writer->s_super; @@ -425,7 +422,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) nilfs_flush_segment(sb, inode->i_ino); if (writer) - nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + up_read(&nilfs->ns_writer_sem); return err; } @@ -516,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, } struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino, gfp_t gfp_mask) + ino_t ino) { - struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, + NILFS_MDT_GFP); if (!inode) return NULL; diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index df683e0bca6..431599733c9 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); -struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, - gfp_t); +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t); struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, ino_t, gfp_t); void nilfs_mdt_destroy(struct inode *); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index d80cc71be74..6dc83591d11 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, printk(KERN_WARNING "NILFS warning: error recovering data block " "(err=%d, ino=%lu, block-offset=%llu)\n", - err, rb->ino, (unsigned long long)rb->blkoff); + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 9e3fe17bb96..e6d9e37fa24 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, { struct bio *bio; - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); if (bio == NULL) { while (!bio && (nr_vecs >>= 1)) - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { bio->bi_bdev = sb->s_bdev; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 51ff3d0a4ee..683df89dbae 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - req->sb_err = nilfs_commit_super(sbi, 0); + req->sb_err = nilfs_commit_super(sbi, + nilfs_altsb_need_update(nilfs)); up_write(&nilfs->ns_sem); } } @@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg) } else { DEFINE_WAIT(wait); int should_sleep = 1; + struct the_nilfs *nilfs; prepare_to_wait(&sci->sc_wait_daemon, &wait, TASK_INTERRUPTIBLE); @@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg) finish_wait(&sci->sc_wait_daemon, &wait); timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && time_after_eq(jiffies, sci->sc_timer->expires)); + nilfs = sci->sc_sbi->s_nilfs; + if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); } goto loop; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a2c4d76c336..0e99e5c0bd0 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -28,7 +28,6 @@ #include <linux/nilfs2_fs.h> #include "mdt.h" -#define NILFS_SUFILE_GFP NILFS_MDT_GFP static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8e2ec43b18f..55f3d6b6073 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -50,6 +50,8 @@ #include <linux/writeback.h> #include <linux/kobject.h> #include <linux/exportfs.h> +#include <linux/seq_file.h> +#include <linux/mount.h> #include "nilfs.h" #include "mdt.h" #include "alloc.h" @@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); -static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); /** @@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb) lock_kernel(); - if (sb->s_dirt) - nilfs_write_super(sb); - nilfs_detach_segment_constructor(sbi); if (!(sb->s_flags & MS_RDONLY)) { @@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb) unlock_kernel(); } -/** - * nilfs_write_super - write super block(s) of NILFS - * @sb: super_block - * - * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and - * clears s_dirt. This function is called in the section protected by - * lock_super(). - * - * The s_dirt flag is managed by each filesystem and we protect it by ns_sem - * of the struct the_nilfs. Lock order must be as follows: - * - * 1. lock_super() - * 2. down_write(&nilfs->ns_sem) - * - * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer - * of the super block (nilfs->ns_sbp[]). - * - * In most cases, VFS functions call lock_super() before calling these - * methods. So we must be careful not to bring on deadlocks when using - * lock_super(); see generic_shutdown_super(), write_super(), and so on. - * - * Note that order of lock_kernel() and lock_super() depends on contexts - * of VFS. We should also note that lock_kernel() can be used in its - * protective section and only the outermost one has an effect. - */ -static void nilfs_write_super(struct super_block *sb) +static int nilfs_sync_fs(struct super_block *sb, int wait) { struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; - - down_write(&nilfs->ns_sem); - if (!(sb->s_flags & MS_RDONLY)) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 t = get_seconds(); - int dupsb; - - if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && - t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { - up_write(&nilfs->ns_sem); - return; - } - dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; - nilfs_commit_super(sbi, dupsb); - } - sb->s_dirt = 0; - up_write(&nilfs->ns_sem); -} - -static int nilfs_sync_fs(struct super_block *sb, int wait) -{ int err = 0; - nilfs_write_super(sb); - /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); + + down_write(&nilfs->ns_sem); + if (sb->s_dirt) + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + return err; } @@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); - sbi->s_ifile = nilfs_mdt_new( - nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); if (!sbi->s_ifile) return -ENOMEM; @@ -416,8 +371,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) if (unlikely(err)) goto failed; + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, &bh_cp); + up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) { if (err == -ENOENT || err == -EINVAL) { printk(KERN_ERR @@ -527,6 +484,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + if (!nilfs_test_opt(sbi, BARRIER)) + seq_printf(seq, ",barrier=off"); + if (nilfs_test_opt(sbi, SNAPSHOT)) + seq_printf(seq, ",cp=%llu", + (unsigned long long int)sbi->s_snapshot_cno); + if (nilfs_test_opt(sbi, ERRORS_RO)) + seq_printf(seq, ",errors=remount-ro"); + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + seq_printf(seq, ",errors=panic"); + if (nilfs_test_opt(sbi, STRICT_ORDER)) + seq_printf(seq, ",order=strict"); + + return 0; +} + static struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .destroy_inode = nilfs_destroy_inode, @@ -536,7 +513,7 @@ static struct super_operations nilfs_sops = { /* .drop_inode = nilfs_drop_inode, */ .delete_inode = nilfs_delete_inode, .put_super = nilfs_put_super, - .write_super = nilfs_write_super, + /* .write_super = nilfs_write_super, */ .sync_fs = nilfs_sync_fs, /* .write_super_lockfs */ /* .unlockfs */ @@ -544,7 +521,7 @@ static struct super_operations nilfs_sops = { .remount_fs = nilfs_remount, .clear_inode = nilfs_clear_inode, /* .umount_begin */ - /* .show_options */ + .show_options = nilfs_show_options }; static struct inode * @@ -814,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, if (sb->s_flags & MS_RDONLY) { if (nilfs_test_opt(sbi, SNAPSHOT)) { + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, sbi->s_snapshot_cno); - if (err < 0) + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + if (err == -ENOENT) + err = -EINVAL; goto failed_sbi; + } if (!err) { printk(KERN_ERR "NILFS: The specified checkpoint is " @@ -1125,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, */ sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); - if (!sd.cno) - /* trying to get the latest checkpoint. */ - sd.cno = nilfs_last_cno(nilfs); - /* * Get super block instance holding the nilfs_sb_info struct. * A new instance is allocated if no existing mount is present or diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8b888982571..d4168e269c5 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_bdev = bdev; atomic_set(&nilfs->ns_count, 1); - atomic_set(&nilfs->ns_writer_refcount, -1); atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); init_rwsem(&nilfs->ns_super_sem); mutex_init(&nilfs->ns_mount_mutex); - mutex_init(&nilfs->ns_writer_mutex); + init_rwsem(&nilfs->ns_writer_sem); INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); @@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, inode_size = nilfs->ns_inode_size; err = -ENOMEM; - nilfs->ns_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_dat)) goto failed; - nilfs->ns_gc_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_gc_dat)) goto failed_dat; - nilfs->ns_cpfile = nilfs_mdt_new( - nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); if (unlikely(!nilfs->ns_cpfile)) goto failed_gc_dat; - nilfs->ns_sufile = nilfs_mdt_new( - nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); if (unlikely(!nilfs->ns_sufile)) goto failed_cpfile; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index e8adbffc626..20abd55881e 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -37,6 +37,7 @@ enum { THE_NILFS_LOADED, /* Roll-back/roll-forward has done and the latest checkpoint was loaded */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ + THE_NILFS_GC_RUNNING, /* gc process is running */ }; /** @@ -50,8 +51,7 @@ enum { * @ns_sem: semaphore for shared states * @ns_super_sem: semaphore for global operations across super block instances * @ns_mount_mutex: mutex protecting mount process of nilfs - * @ns_writer_mutex: mutex protecting ns_writer attach/detach - * @ns_writer_refcount: number of referrers on ns_writer + * @ns_writer_sem: semaphore protecting ns_writer attach/detach * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data @@ -100,8 +100,7 @@ struct the_nilfs { struct rw_semaphore ns_sem; struct rw_semaphore ns_super_sem; struct mutex ns_mount_mutex; - struct mutex ns_writer_mutex; - atomic_t ns_writer_refcount; + struct rw_semaphore ns_writer_sem; /* * components protected by ns_super_sem @@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) +THE_NILFS_FNS(GC_RUNNING, gc_running) /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 #define NILFS_ALTSB_FREQ 60 /* spare superblock */ +static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + return t < nilfs->ns_sbwtime[0] || + t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ; +} + +static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + struct nilfs_super_block **sbp = nilfs->ns_sbp; + return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; +} + void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *find_or_create_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); @@ -221,39 +235,26 @@ static inline void get_nilfs(struct the_nilfs *nilfs) atomic_inc(&nilfs->ns_count); } -static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) -{ - if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) - mutex_lock(&nilfs->ns_writer_mutex); - return nilfs->ns_writer; -} - -static inline void nilfs_put_writer(struct the_nilfs *nilfs) -{ - if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) - mutex_unlock(&nilfs->ns_writer_mutex); -} - static inline void nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); nilfs->ns_writer = sbi; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); if (sbi == nilfs->ns_writer) nilfs->ns_writer = NULL; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) { - if (!atomic_dec_and_test(&sbi->s_count)) + if (atomic_dec_and_test(&sbi->s_count)) kfree(sbi); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 47cd258fd24..c9ee67b442e 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev event_priv->wd = wd; ret = fsnotify_add_notify_event(group, event, fsn_event_priv); - /* EEXIST is not an error */ - if (ret == -EEXIST) - ret = 0; - - /* did event_priv get attached? */ - if (list_empty(&fsn_event_priv->event_list)) + if (ret) { inotify_free_event_priv(fsn_event_priv); + /* EEXIST says we tail matched, EOVERFLOW isn't something + * to report up the stack. */ + if ((ret == -EEXIST) || + (ret == -EOVERFLOW)) + ret = 0; + } /* * If we hold the entry until after the event is on the queue @@ -104,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode return send; } +/* + * This is NEVER supposed to be called. Inotify marks should either have been + * removed from the idr when the watch was removed or in the + * fsnotify_destroy_mark_by_group() call when the inotify instance was being + * torn down. This is only called if the idr is about to be freed but there + * are still marks in it. + */ static int idr_callback(int id, void *p, void *data) { - BUG(); + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *ientry; + static bool warned = false; + + if (warned) + return 0; + + warned = false; + entry = p; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + + WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " + "idr. Probably leaking memory\n", id, p, data); + + /* + * I'm taking the liberty of assuming that the mark in question is a + * valid address and I'm dereferencing it. This might help to figure + * out why we got here and the panic is no worse than the original + * BUG() that was here. + */ + if (entry) + printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", + entry->group, entry->inode, ientry->wd); return 0; } static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in teh callback */ - idr_for_each(&group->inotify_data.idr, idr_callback, NULL); + idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_remove_all(&group->inotify_data.idr); idr_destroy(&group->inotify_data.idr); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f30d9bbc2e1..dcd2040d330 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,9 +47,6 @@ static struct vfsmount *inotify_mnt __read_mostly; -/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */ -static struct inotify_event nul_inotify_event; - /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; static int inotify_max_queued_events __read_mostly; @@ -157,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, event = fsnotify_peek_notify_event(group); - event_size += roundup(event->name_len, event_size); + if (event->name_len) + event_size += roundup(event->name_len + 1, event_size); if (event_size > count) return ERR_PTR(-EINVAL); @@ -183,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event_private_data *fsn_priv; struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); - size_t name_len; + size_t name_len = 0; /* we get the inotify watch descriptor from the event private data */ spin_lock(&event->lock); @@ -199,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, inotify_free_event_priv(fsn_priv); } - /* round up event->name_len so it is a multiple of event_size */ - name_len = roundup(event->name_len, event_size); + /* + * round up event->name_len so it is a multiple of event_size + * plus an extra byte for the terminating '\0'. + */ + if (event->name_len) + name_len = roundup(event->name_len + 1, event_size); inotify_event.len = name_len; inotify_event.mask = inotify_mask_to_arg(event->mask); @@ -224,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, return -EFAULT; buf += event->name_len; - /* fill userspace with 0's from nul_inotify_event */ - if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) + /* fill userspace with 0's */ + if (clear_user(buf, len_to_zero)) return -EFAULT; buf += len_to_zero; event_size += name_len; @@ -326,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, list_for_each_entry(holder, &group->notification_list, event_list) { event = holder->event; send_len += sizeof(struct inotify_event); - send_len += roundup(event->name_len, - sizeof(struct inotify_event)); + if (event->name_len) + send_len += roundup(event->name_len + 1, + sizeof(struct inotify_event)); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -364,20 +367,53 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns return error; } +/* + * Remove the mark from the idr (if present) and drop the reference + * on the mark because it was in the idr. + */ static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark_entry *ientry) { struct idr *idr; + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *found_ientry; + int wd; spin_lock(&group->inotify_data.idr_lock); idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); + wd = ientry->wd; + + if (wd == -1) + goto out; + + entry = idr_find(&group->inotify_data.idr, wd); + if (unlikely(!entry)) + goto out; + + found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + if (unlikely(found_ientry != ientry)) { + /* We found an entry in the idr with the right wd, but it's + * not the entry we were told to remove. eparis seriously + * fucked up somewhere. */ + WARN_ON(1); + ientry->wd = -1; + goto out; + } + + /* One ref for being in the idr, one ref held by the caller */ + BUG_ON(atomic_read(&entry->refcnt) < 2); + + idr_remove(idr, wd); ientry->wd = -1; + + /* removed from the idr, drop that ref */ + fsnotify_put_mark(entry); +out: + spin_unlock(&group->inotify_data.idr_lock); } + /* - * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the - * internal reference help on the mark because it is in the idr. + * Send IN_IGNORED for this wd, remove this wd from the idr. */ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) @@ -386,6 +422,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; + int ret; ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0, @@ -404,10 +441,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); - - /* did the private data get added? */ - if (list_empty(&fsn_event_priv->event_list)) + ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); + if (ret) inotify_free_event_priv(fsn_event_priv); skip_send_ignore: @@ -418,9 +453,6 @@ skip_send_ignore: /* remove this entry from the idr */ inotify_remove_from_idr(group, ientry); - /* removed from idr, drop that reference */ - fsnotify_put_mark(entry); - atomic_dec(&group->inotify_data.user->inotify_watches); } @@ -432,80 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry) kmem_cache_free(inotify_inode_mark_cachep, ientry); } -static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +static int inotify_update_existing_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) { - struct fsnotify_mark_entry *entry = NULL; + struct fsnotify_mark_entry *entry; struct inotify_inode_mark_entry *ientry; - struct inotify_inode_mark_entry *tmp_ientry; - int ret = 0; - int add = (arg & IN_MASK_ADD); - __u32 mask; __u32 old_mask, new_mask; + __u32 mask; + int add = (arg & IN_MASK_ADD); + int ret; /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); if (unlikely(!mask)) return -EINVAL; - tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!tmp_ientry)) - return -ENOMEM; - /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); - tmp_ientry->wd = -1; - -find_entry: spin_lock(&inode->i_lock); entry = fsnotify_find_mark_entry(group, inode); spin_unlock(&inode->i_lock); - if (entry) { - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - } else { - ret = -ENOSPC; - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) - goto out_err; -retry: - ret = -ENOMEM; - if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) - goto out_err; - - spin_lock(&group->inotify_data.idr_lock); - ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, - group->inotify_data.last_wd, - &tmp_ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - if (ret) { - if (ret == -EAGAIN) - goto retry; - goto out_err; - } + if (!entry) + return -ENOENT; - ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); - if (ret) { - inotify_remove_from_idr(group, tmp_ientry); - if (ret == -EEXIST) - goto find_entry; - goto out_err; - } - - /* tmp_ientry has been added to the inode, so we are all set up. - * now we just need to make sure tmp_ientry doesn't get freed and - * we need to set up entry and ientry so the generic code can - * do its thing. */ - ientry = tmp_ientry; - entry = &ientry->fsn_entry; - tmp_ientry = NULL; - - atomic_inc(&group->inotify_data.user->inotify_watches); - - /* update the idr hint */ - group->inotify_data.last_wd = ientry->wd; - - /* we put the mark on the idr, take a reference */ - fsnotify_get_mark(entry); - } - - ret = ientry->wd; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); spin_lock(&entry->lock); @@ -537,18 +518,107 @@ retry: fsnotify_recalc_group_mask(group); } - /* this either matches fsnotify_find_mark_entry, or init_mark_entry - * depending on which path we took... */ + /* return the wd */ + ret = ientry->wd; + + /* match the get from fsnotify_find_mark_entry() */ fsnotify_put_mark(entry); + return ret; +} + +static int inotify_new_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct inotify_inode_mark_entry *tmp_ientry; + __u32 mask; + int ret; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!mask)) + return -EINVAL; + + tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_ientry)) + return -ENOMEM; + + fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); + tmp_ientry->fsn_entry.mask = mask; + tmp_ientry->wd = -1; + + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) + goto out_err; +retry: + ret = -ENOMEM; + if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) + goto out_err; + + spin_lock(&group->inotify_data.idr_lock); + ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, + group->inotify_data.last_wd, + &tmp_ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + if (ret) { + /* idr was out of memory allocate and try again */ + if (ret == -EAGAIN) + goto retry; + goto out_err; + } + + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(&tmp_ientry->fsn_entry); + + /* we are on the idr, now get on the inode */ + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + if (ret) { + /* we failed to get on the inode, get off the idr */ + inotify_remove_from_idr(group, tmp_ientry); + goto out_err; + } + + /* update the idr hint, who cares about races, it's just a hint */ + group->inotify_data.last_wd = tmp_ientry->wd; + + /* increment the number of watches the user has */ + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* return the watch descriptor for this new entry */ + ret = tmp_ientry->wd; + + /* match the ref from fsnotify_init_markentry() */ + fsnotify_put_mark(&tmp_ientry->fsn_entry); + + /* if this mark added a new event update the group mask */ + if (mask & ~group->mask) + fsnotify_recalc_group_mask(group); + out_err: - /* could be an error, could be that we found an existing mark */ - if (tmp_ientry) { - /* on the idr but didn't make it on the inode */ - if (tmp_ientry->wd != -1) - inotify_remove_from_idr(group, tmp_ientry); + if (ret < 0) kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); - } + + return ret; +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + int ret = 0; + +retry: + /* try to update and existing watch with the new arg */ + ret = inotify_update_existing_watch(group, inode, arg); + /* no mark present, try to add a new one */ + if (ret == -ENOENT) + ret = inotify_new_watch(group, inode, arg); + /* + * inotify_new_watch could race with another thread which did an + * inotify_new_watch between the update_existing and the add watch + * here, go back and try to update an existing mark again. + */ + if (ret == -EEXIST) + goto retry; return ret; } @@ -568,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; + group->inotify_data.last_wd = 1; group->inotify_data.user = user; group->inotify_data.fa = NULL; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 521368574e9..3816d5750dd 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -153,6 +153,10 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new return true; break; case (FSNOTIFY_EVENT_NONE): + if (old->mask & FS_Q_OVERFLOW) + return true; + else if (old->mask & FS_IN_IGNORED) + return false; return false; }; } @@ -171,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even struct list_head *list = &group->notification_list; struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - - /* easy to tell if priv was attached to the event */ - INIT_LIST_HEAD(&priv->event_list); + int ret = 0; /* * There is one fsnotify_event_holder embedded inside each fsnotify_event. @@ -194,6 +196,7 @@ alloc_holder: if (group->q_len >= group->max_events) { event = &q_overflow_event; + ret = -EOVERFLOW; /* sorry, no private data on the overflow event */ priv = NULL; } @@ -235,7 +238,7 @@ alloc_holder: mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); - return 0; + return ret; } /* diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f9a3e894266..ab513ddaeff 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6851,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } status = 0; bail: - + brelse(last_eb_bh); mlog_exit(status); return status; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b401654011a..8a1e61545f4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1747,8 +1747,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * we know zeros will only be needed in the first and/or last cluster. */ if (clusters_to_alloc || extents_to_split || - wc->w_desc[0].c_needs_zero || - wc->w_desc[wc->w_clen - 1].c_needs_zero) + (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero))) cluster_of_pages = 1; else cluster_of_pages = 0; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 2f28b7de2c8..b4957c7d9fe 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, goto bail; } + /* + * If the last lookup failed to create dentry lock, let us + * redo it. + */ + if (!dentry->d_fsdata) { + mlog(0, "Inode %llu doesn't have dentry lock, " + "returning false\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + ret = 1; bail: diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 1c9efb406a9..02bf17808bd 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -325,6 +325,7 @@ clear_fields: } static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index fcf879ed693..756f5b0998e 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, * that still has AST's pending... */ in_use = !list_empty(&lock->ast_list); spin_unlock(&dlm->ast_lock); - if (in_use) { + if (in_use && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " "while waiting for an ast!", res->lockname.len, res->lockname.name); @@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_IN_PROGRESS) { - if (master_node) { + if (master_node && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres in progress!\n"); spin_unlock(&res->spinlock); return DLM_FORWARD; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index fcdba091af3..c212cf5a2bd 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", }; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index bf7742d0ee3..44f2a5e1d04 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -23,6 +23,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "uptodate.h" +#include "super.h" #include "quota.h" static struct workqueue_struct *ocfs2_quota_wq = NULL; @@ -114,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block, int rc = 0; struct buffer_head *tmp = *bh; + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b0ee0fdf799..a3f8871d21f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1218,13 +1218,17 @@ static void ocfs2_kill_sb(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); + /* Failed mount? */ + if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) + goto out; + /* Prevent further queueing of inode drop events */ spin_lock(&dentry_list_lock); ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); spin_unlock(&dentry_list_lock); /* Wait for work to finish and/or remove it */ cancel_work_sync(&osb->dentry_lock_work); - +out: kill_block_super(sb); } diff --git a/fs/open.c b/fs/open.c index dd98e807602..31191bf513e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -199,7 +199,7 @@ out: int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { - int err; + int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ @@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, } /* Remove suid/sgid on truncate too */ - newattrs.ia_valid |= should_remove_suid(dentry); + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; mutex_lock(&dentry->d_inode->i_mutex); - err = notify_change(dentry, &newattrs); + ret = notify_change(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); - return err; + return ret; } static long do_sys_truncate(const char __user *pathname, loff_t length) @@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, int error; struct file *f; + validate_creds(cred); + /* * We must always pass in a valid mount pointer. Historically * callers got away with not passing it, but we must enforce this at diff --git a/fs/proc/base.c b/fs/proc/base.c index 175db258942..6f742f6658a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - task_lock(task); - if (task->mm) - oom_adjust = task->mm->oom_adj; - else - oom_adjust = OOM_DISABLE; - task_unlock(task); + oom_adjust = task->oomkilladj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - task_lock(task); - if (!task->mm) { - task_unlock(task); - put_task_struct(task); - return -EINVAL; - } - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { - task_unlock(task); + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { put_task_struct(task); return -EACCES; } - task->mm->oom_adj = oom_adjust; - task_unlock(task); + task->oomkilladj = oom_adjust; put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0ff7566c767..a7f0110fca4 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { + .name = "ramfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | diff --git a/fs/super.c b/fs/super.c index 2761d3e22ed..9cda337ddae 100644 --- a/fs/super.c +++ b/fs/super.c @@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s = NULL; goto out; } - INIT_LIST_HEAD(&s->s_dirty); - INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_more_io); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); @@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -static void put_super(struct super_block *sb) +void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); diff --git a/fs/sync.c b/fs/sync.c index 3422ba61d86..103cc7fdd3d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -19,20 +19,22 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. */ static int __sync_filesystem(struct super_block *sb, int wait) { /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) + if (!wait) { writeout_quota_sb(sb, -1); - else + writeback_inodes_sb(sb); + } else { sync_quota_sb(sb, -1); - sync_inodes_sb(sb, wait); + sync_inodes_sb(sb); + } if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); @@ -118,7 +120,7 @@ restart: */ SYSCALL_DEFINE0(sync) { - wakeup_pdflush(0); + wakeup_flusher_threads(0); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 14f2d71ea3c..0050fc40e8c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; static void remove_dir(struct sysfs_dirent *sd) diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 555f0ff988d..e28cecf179f 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/xattr.h> +#include <linux/security.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = { }; static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations sysfs_inode_operations ={ .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; int __init sysfs_inode_init(void) @@ -42,18 +46,37 @@ int __init sysfs_inode_init(void) return bdi_init(&sysfs_backing_dev_info); } +struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = 0; + iattrs->ia_gid = 0; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) { struct inode * inode = dentry->d_inode; struct sysfs_dirent * sd = dentry->d_fsdata; - struct iattr * sd_iattr; + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; int error; if (!sd) return -EINVAL; - sd_iattr = sd->s_iattr; + sd_attrs = sd->s_iattr; error = inode_change_ok(inode, iattr); if (error) @@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (error) return error; - if (!sd_iattr) { + if (!sd_attrs) { /* setting attributes for the first time, allocate now */ - sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); - if (!sd_iattr) + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) return -ENOMEM; - /* assign default attributes */ - sd_iattr->ia_mode = sd->s_mode; - sd_iattr->ia_uid = 0; - sd_iattr->ia_gid = 0; - sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; - sd->s_iattr = sd_iattr; + sd->s_iattr = sd_attrs; + } else { + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + iattrs->ia_mode = sd->s_mode = mode; + } } + return error; +} - /* attributes were changed atleast once in past */ - - if (ia_valid & ATTR_UID) - sd_iattr->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - sd_iattr->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - sd_iattr->ia_mode = sd->s_mode = mode; - } +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *iattrs; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + if (!sd->s_iattr) + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + + iattrs = sd->s_iattr; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + if (iattrs->ia_secdata) + security_release_secctx(iattrs->ia_secdata, + iattrs->ia_secdata_len); + iattrs->ia_secdata = secdata; + iattrs->ia_secdata_len = secdata_len; + } else + return -EINVAL; +out: return error; } @@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; + struct sysfs_inode_attrs *iattrs; inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; @@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_ino = sd->s_ino; lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - if (sd->s_iattr) { + iattrs = sd->s_iattr; + if (iattrs) { /* sysfs_dirent has non-default attributes * get them for the new inode from persistent copy * in sysfs_dirent */ - set_inode_attr(inode, sd->s_iattr); + set_inode_attr(inode, &iattrs->ia_iattr); + if (iattrs->ia_secdata) + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); } else set_default_inode_attr(inode, sd->s_mode); - /* initialize inode according to type */ switch (sysfs_type(sd)) { case SYSFS_DIR: diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1d897ad808e..c5081ad7702 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/mutex.h> +#include <linux/security.h> #include "sysfs.h" @@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co } const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, .readlink = generic_readlink, .follow_link = sysfs_follow_link, .put_link = sysfs_put_link, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3fa0d98481e..af4c4e7482a 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,8 @@ * This file is released under the GPLv2. */ +#include <linux/fs.h> + struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ @@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr { struct hlist_head buffers; }; +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + /* * sysfs_dirent - the building block of sysfs hierarchy. Each and * every sysfs node is represented by single sysfs_dirent. @@ -56,7 +64,7 @@ struct sysfs_dirent { unsigned int s_flags; ino_t s_ino; umode_t s_mode; - struct iattr *s_iattr; + struct sysfs_inode_attrs *s_iattr; }; #define SD_DEACTIVATED_BIAS INT_MIN @@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); int sysfs_inode_init(void); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index eaf6d891d46..1c8991b0db1 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -65,26 +65,14 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write) { int nr_written; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .range_end = LLONG_MAX, - .nr_to_write = nr_to_write, - }; - - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; + nr_written = writeback_inodes_sb(c->vfs_sb); if (!nr_written) { /* * Re-try again but wait on pages/inodes which are being * written-back concurrently (e.g., by pdflush). */ - memset(&wbc, 0, sizeof(struct writeback_control)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.range_end = LLONG_MAX; - wbc.nr_to_write = nr_to_write; - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; + nr_written = sync_inodes_sb(c->vfs_sb); } dbg_budg("%d pages were written back", nr_written); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 26d2e0d8046..51763aa8f4d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .range_start = 0, - .range_end = LLONG_MAX, - .nr_to_write = LONG_MAX, - }; /* * Zero @wait is just an advisory thing to help the file system shove @@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) * the user be able to get more accurate results of 'statfs()' after * they synchronize the file system. */ - generic_sync_sb_inodes(sb, &wbc); + sync_inodes_sb(sb); /* * Synchronize write buffers, because 'ubifs_run_commit()' does not @@ -1971,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ + c->bdi.name = "ubifs", c->bdi.capabilities = BDI_CAP_MAP_COPY; c->bdi.unplug_io_fn = default_unplug_io_fn; err = bdi_init(&c->bdi); diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 1d2c570704c..2ffdb6733af 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -18,59 +18,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> -#if 0 -static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, - uint8_t ad_size, struct kernel_lb_addr fe_loc, - int *pos, int *offset, struct buffer_head **bh, - int *error) -{ - int loffset = *offset; - int block; - uint8_t *ad; - int remainder; - - *error = 0; - - ad = (uint8_t *)(*bh)->b_data + *offset; - *offset += ad_size; - - if (!ad) { - brelse(*bh); - *error = 1; - return NULL; - } - - if (*offset == dir->i_sb->s_blocksize) { - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - *bh = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - } else if (*offset > dir->i_sb->s_blocksize) { - ad = tmpad; - - remainder = dir->i_sb->s_blocksize - loffset; - memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder); - - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - (*bh) = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - - memcpy((uint8_t *)ad + remainder, (*bh)->b_data, - ad_size - remainder); - *offset = ad_size - remainder; - } - - return ad; -} -#endif - struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi, @@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) return fi; } -#if 0 -static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) -{ - struct extent_ad *ext; - struct fileEntry *fe; - uint8_t *ptr; - - if ((!buffer) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n"); - return NULL; - } - - fe = (struct fileEntry *)buffer; - - if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) { - udf_debug("0x%x != TAG_IDENT_FE\n", - le16_to_cpu(fe->descTag.tagIdent)); - return NULL; - } - - ptr = (uint8_t *)(fe->extendedAttr) + - le32_to_cpu(fe->lengthExtendedAttr); - - if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) - ptr += *offset; - - ext = (struct extent_ad *)ptr; - - *offset = *offset + sizeof(struct extent_ad); - return ext; -} -#endif - struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 7464305382b..b80cbd78833 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&inode->i_mutex); lock_kernel(); udf_discard_prealloc(inode); unlock_kernel(); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e7533f78563..6d24c2c63f9 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -90,19 +90,16 @@ no_delete: } /* - * If we are going to release inode from memory, we discard preallocation and - * truncate last inode extent to proper length. We could use drop_inode() but - * it's called under inode_lock and thus we cannot mark inode dirty there. We - * use clear_inode() but we have to make sure to write inode as it's not written - * automatically. + * If we are going to release inode from memory, we truncate last inode extent + * to proper length. We could use drop_inode() but it's called under inode_lock + * and thus we cannot mark inode dirty there. We use clear_inode() but we have + * to make sure to write inode as it's not written automatically. */ void udf_clear_inode(struct inode *inode) { struct udf_inode_info *iinfo; if (!(inode->i_sb->s_flags & MS_RDONLY)) { lock_kernel(); - /* Discard preallocation for directories, symlinks, etc. */ - udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); unlock_kernel(); write_inode_now(inode, 0); @@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); #ifdef UDF_PREALLOCATE - /* preallocate blocks */ - udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); + /* We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. */ + if (S_ISREG(inode->i_mode)) + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); #endif /* merge any continuous blocks in laarr */ diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 1b88fd5df05..43e24a3b8e1 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb) ms_info.addr_format = CDROM_LBA; i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); -#define WE_OBEY_THE_WRITTEN_STANDARDS 1 - if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); -#if WE_OBEY_THE_WRITTEN_STANDARDS if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ -#endif vol_desc_start = ms_info.addr.lba; } else { udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 6a29fa34c47..21dad8c608f 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, pc->componentType = 1; pc->lengthComponentIdent = 0; pc->componentFileVersionNum = 0; - pc += sizeof(struct pathComponent); elen += sizeof(struct pathComponent); } diff --git a/fs/xattr.c b/fs/xattr.c index 1c3d0af59dd..6d4f6d3449f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask) return inode_permission(inode, mask); } -int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - int error; - - error = xattr_permission(inode, name, MAY_WRITE); - if (error) - return error; + int error = -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); - error = security_inode_setxattr(dentry, name, value, size, flags); - if (error) - goto out; - error = -EOPNOTSUPP; if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!error) fsnotify_xattr(dentry); } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + out: mutex_unlock(&inode->i_mutex); return error; diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 0882d166239..eafcc7c1870 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -619,7 +619,7 @@ xfs_file_compat_ioctl( case XFS_IOC_GETVERSION_32: cmd = _NATIVE_IOC(cmd, long); return xfs_file_ioctl(filp, cmd, p); - case XFS_IOC_SWAPEXT: { + case XFS_IOC_SWAPEXT_32: { struct xfs_swapext sxp; struct compat_xfs_swapext __user *sxu = arg; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 8070b34cc28..6c32f1d63d8 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -485,14 +485,6 @@ xfs_vn_put_link( } STATIC int -xfs_vn_permission( - struct inode *inode, - int mask) -{ - return generic_permission(inode, mask, xfs_check_acl); -} - -STATIC int xfs_vn_getattr( struct vfsmount *mnt, struct dentry *dentry, @@ -696,7 +688,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -724,7 +716,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -749,7 +741,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -762,7 +754,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b619d6b8ca4..98ef624d9ba 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -708,6 +708,16 @@ xfs_reclaim_inode( return 0; } +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); +} + /* * We set the inode flag atomically with the radix tree tag. * Once we get tag lookups on the radix tree, this inode flag @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( read_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 2a10301c99c..59120602588 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 34ec86923f7..ecbf8b4d2e2 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -191,80 +191,82 @@ xfs_iget_cache_hit( int flags, int lock_flags) __releases(pag->pag_ici_lock) { + struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; - int error = EAGAIN; + int error; + + spin_lock(&ip->i_flags_lock); /* - * If INEW is set this inode is being set up - * If IRECLAIM is set this inode is being torn down - * Pause and try again. + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. */ - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; goto out_error; } - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - - /* - * If lookup is racing with unlink, then we should return an - * error immediately so we don't remove it from the reclaim - * list and potentially leak the inode. - */ - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); /* - * We need to re-initialise the VFS inode as it has been - * 'freed' by the VFS. Do this here so we can deal with - * errors cleanly, then tag it so it can be set up correctly - * later. + * We need to set XFS_INEW atomically with clearing the + * reclaimable tag so that we do have an indicator of the + * inode still being initialized. */ - if (inode_init_always(mp->m_super, VFS_I(ip))) { - error = ENOMEM; - goto out_error; - } + ip->i_flags |= XFS_INEW; + ip->i_flags &= ~XFS_IRECLAIMABLE; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); - /* - * We must set the XFS_INEW flag before clearing the - * XFS_IRECLAIMABLE flag so that if a racing lookup does - * not find the XFS_IRECLAIMABLE above but has the igrab() - * below succeed we can safely check XFS_INEW to detect - * that this inode is still being initialised. - */ - xfs_iflags_set(ip, XFS_INEW); - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); - /* clear the radix tree reclaim flag as well. */ - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - } else if (!igrab(VFS_I(ip))) { + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + read_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~XFS_INEW; + ip->i_flags |= XFS_IRECLAIMABLE; + __xfs_inode_set_reclaim_tag(pag, ip); + goto out_error; + } + inode->i_state = I_LOCK|I_NEW; + } else { /* If the VFS inode is being torn down, pause and try again. */ - XFS_STATS_INC(xs_ig_frecycle); - goto out_error; - } else if (xfs_iflags_test(ip, XFS_INEW)) { - /* - * We are racing with another cache hit that is - * currently recycling this inode out of the XFS_IRECLAIMABLE - * state. Wait for the initialisation to complete before - * continuing. - */ - wait_on_inode(VFS_I(ip)); - } + if (!igrab(inode)) { + error = EAGAIN; + goto out_error; + } - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - iput(VFS_I(ip)); - goto out_error; + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); } - /* We've got a live one. */ - read_unlock(&pag->pag_ici_lock); - if (lock_flags != 0) xfs_ilock(ip, lock_flags); @@ -274,6 +276,7 @@ xfs_iget_cache_hit( return 0; out_error: + spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); return error; } |