diff options
Diffstat (limited to 'fs')
188 files changed, 3362 insertions, 4227 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index e98f56d3787..e9cb57f0754 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -96,14 +96,10 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -int v9fs_check_acl(struct inode *inode, int mask) +struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) { - struct posix_acl *acl; struct v9fs_session_info *v9ses; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - v9ses = v9fs_inode2v9ses(inode); if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { @@ -111,18 +107,10 @@ int v9fs_check_acl(struct inode *inode, int mask) * On access = client and acl = on mode get the acl * values from the server */ - return 0; + return NULL; } - acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); + return v9fs_get_cached_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - return -EAGAIN; } static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) @@ -165,32 +153,32 @@ err_free_out: int v9fs_acl_chmod(struct dentry *dentry) { int retval = 0; - struct posix_acl *acl, *clone; + struct posix_acl *acl; struct inode *inode = dentry->d_inode; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - clone = posix_acl_clone(acl, GFP_KERNEL); + retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (retval) + return retval; + retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - retval = posix_acl_chmod_masq(clone, inode->i_mode); - if (!retval) - retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); } return retval; } int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, struct posix_acl *pacl) + struct posix_acl **dpacl, struct posix_acl **pacl) { - v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); - v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); - posix_acl_release(dpacl); - posix_acl_release(pacl); + if (dentry) { + v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl); + v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl); + } + posix_acl_release(*dpacl); + posix_acl_release(*pacl); + *dpacl = *pacl = NULL; return 0; } @@ -209,29 +197,18 @@ int v9fs_acl_mode(struct inode *dir, mode_t *modep, mode &= ~current_umask(); } if (acl) { - struct posix_acl *clone; - if (S_ISDIR(mode)) - *dpacl = acl; - clone = posix_acl_clone(acl, GFP_NOFS); - retval = -ENOMEM; - if (!clone) - goto cleanup; - - retval = posix_acl_create_masq(clone, &mode); - if (retval < 0) { - posix_acl_release(clone); - goto cleanup; - } + *dpacl = posix_acl_dup(acl); + retval = posix_acl_create(&acl, GFP_NOFS, &mode); + if (retval < 0) + return retval; if (retval > 0) - *pacl = clone; + *pacl = acl; + else + posix_acl_release(acl); } *modep = mode; return 0; -cleanup: - posix_acl_release(acl); - return retval; - } static int v9fs_remote_get_acl(struct dentry *dentry, const char *name, diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 59e18c2e8c7..ddb7ae19d97 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,14 +16,14 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern int v9fs_check_acl(struct inode *inode, int mask); +extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, - struct posix_acl *, struct posix_acl *); + struct posix_acl **, struct posix_acl **); extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); #else -#define v9fs_check_acl NULL +#define v9fs_iop_get_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; @@ -33,8 +33,8 @@ static inline int v9fs_acl_chmod(struct dentry *dentry) return 0; } static inline int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, - struct posix_acl *pacl) + struct posix_acl **dpacl, + struct posix_acl **pacl) { return 0; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 276f4a69ecd..9a26dce5a99 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -287,7 +287,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, goto error; /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); @@ -328,6 +328,7 @@ error: err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -421,12 +422,13 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, d_instantiate(dentry, inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -826,10 +828,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, d_instantiate(dentry, inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -914,7 +917,7 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { @@ -924,7 +927,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, - .check_acl = v9fs_check_acl, + .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { diff --git a/fs/Makefile b/fs/Makefile index fb68c2b8cf8..afc109691a9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -29,7 +29,6 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o -obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index c5567cb7843..f11e43ed907 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); */ static struct inode *anon_inode_mkinode(void) { - struct inode *inode = new_inode(anon_inode_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -233,7 +233,7 @@ static int __init anon_inode_init(void) return 0; err_mntput: - mntput(anon_inode_mnt); + kern_unmount(anon_inode_mnt); err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); err_exit: diff --git a/fs/block_dev.c b/fs/block_dev.c index 9fb0b15331d..f55aad4d161 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } - EXPORT_SYMBOL(I_BDEV); /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *old = inode->i_data.backing_dev_info; + + if (unlikely(dst == old)) /* deadlock avoidance */ + return; + bdi_lock_two(&old->wb, &dst->wb); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&old->wb.list_lock); + spin_unlock(&dst->wb.list_lock); } static sector_t max_block(struct block_device *bdev) @@ -1448,6 +1452,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + mutex_lock(&bdev->bd_mutex); + if (mode & FMODE_EXCL) { bool bdev_free; @@ -1456,7 +1462,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * are protected with bdev_lock. bd_mutex is to * synchronize disk_holder unlinking. */ - mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); @@ -1474,17 +1479,21 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * If this was the last claim, remove holder link and * unblock evpoll if it was a write holder. */ - if (bdev_free) { - if (bdev->bd_write_holder) { - disk_unblock_events(bdev->bd_disk); - disk_check_events(bdev->bd_disk); - bdev->bd_write_holder = false; - } + if (bdev_free && bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; } - - mutex_unlock(&bdev->bd_mutex); } + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); + + mutex_unlock(&bdev->bd_mutex); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9f62ab2a728..65a735d8f6e 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -30,7 +30,7 @@ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { int size; const char *name; @@ -195,27 +195,6 @@ out: return ret; } -int btrfs_check_acl(struct inode *inode, int mask) -{ - int error = -EAGAIN; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - error = -ECHILD; - } else { - struct posix_acl *acl; - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } - } - - return error; -} - /* * btrfs_init_acl is already generally called under fs_mutex, so the locking * stuff has been fixed to work with that. If the locking stuff changes, we @@ -243,8 +222,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, } if (IS_POSIXACL(dir) && acl) { - struct posix_acl *clone; - mode_t mode; + mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { ret = btrfs_set_acl(trans, inode, acl, @@ -252,22 +230,15 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret) goto failed; } - clone = posix_acl_clone(acl, GFP_NOFS); - ret = -ENOMEM; - if (!clone) - goto failed; + ret = posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; - mode = inode->i_mode; - ret = posix_acl_create_masq(clone, &mode); - if (ret >= 0) { - inode->i_mode = mode; - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, clone, - ACL_TYPE_ACCESS); - } + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); } - posix_acl_release(clone); } failed: posix_acl_release(acl); @@ -277,7 +248,7 @@ failed: int btrfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int ret = 0; if (S_ISLNK(inode->i_mode)) @@ -290,17 +261,11 @@ int btrfs_acl_chmod(struct inode *inode) if (IS_ERR_OR_NULL(acl)) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - - ret = posix_acl_chmod_masq(clone, inode->i_mode); - if (!ret) - ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); - - posix_acl_release(clone); - return ret; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 52d7eca8c7b..502b9e98867 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -34,6 +34,9 @@ struct btrfs_inode { */ struct btrfs_key location; + /* Lock for counters */ + spinlock_t lock; + /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; @@ -134,8 +137,8 @@ struct btrfs_inode { * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. */ - atomic_t outstanding_extents; - atomic_t reserved_extents; + unsigned outstanding_extents; + unsigned reserved_extents; /* * ordered_data_close is set by truncate when a file that used @@ -184,4 +187,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } +static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, + struct inode *inode) +{ + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + return true; + return false; +} + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2e667868e0d..011cab3aca8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) { int i; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (p->nodes[i] && p->locks[i]) - btrfs_set_lock_blocking(p->nodes[i]); + if (!p->nodes[i] || !p->locks[i]) + continue; + btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_READ_LOCK) + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; + else if (p->locks[i] == BTRFS_WRITE_LOCK) + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; } } @@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) * for held */ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held) + struct extent_buffer *held, int held_rw) { int i; @@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, * really sure by forcing the path to blocking before we clear * the path blocking. */ - if (held) - btrfs_set_lock_blocking(held); + if (held) { + btrfs_set_lock_blocking_rw(held, held_rw); + if (held_rw == BTRFS_WRITE_LOCK) + held_rw = BTRFS_WRITE_LOCK_BLOCKING; + else if (held_rw == BTRFS_READ_LOCK) + held_rw = BTRFS_READ_LOCK_BLOCKING; + } btrfs_set_path_blocking(p); #endif for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) - btrfs_clear_lock_blocking(p->nodes[i]); + if (p->nodes[i] && p->locks[i]) { + btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) + p->locks[i] = BTRFS_WRITE_LOCK; + else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) + p->locks[i] = BTRFS_READ_LOCK; + } } #ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) - btrfs_clear_lock_blocking(held); + btrfs_clear_lock_blocking_rw(held, held_rw); #endif } @@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p) if (!p->nodes[i]) continue; if (p->locks[i]) { - btrfs_tree_unlock(p->nodes[i]); + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); p->locks[i] = 0; } free_extent_buffer(p->nodes[i]); @@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) return eb; } +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + /* cowonly root (everything not a reference counted cow subvolume), just get * put onto a simple dirty list. transaction.c walks this to make sure they * get properly updated on disk. @@ -626,14 +660,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, for (i = start_slot; i < end_slot; i++) { int close = 1; - if (!parent->map_token) { - map_extent_buffer(parent, - btrfs_node_key_ptr_offset(i), - sizeof(struct btrfs_key_ptr), - &parent->map_token, &parent->kaddr, - &parent->map_start, &parent->map_len, - KM_USER1); - } btrfs_node_key(parent, &disk_key, i); if (!progress_passed && comp_keys(&disk_key, progress) < 0) continue; @@ -656,11 +682,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, last_block = blocknr; continue; } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } cur = btrfs_find_tree_block(root, blocknr, blocksize); if (cur) @@ -701,11 +722,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, btrfs_tree_unlock(cur); free_extent_buffer(cur); } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } return err; } @@ -746,7 +762,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, struct btrfs_disk_key *tmp = NULL; struct btrfs_disk_key unaligned; unsigned long offset; - char *map_token = NULL; char *kaddr = NULL; unsigned long map_start = 0; unsigned long map_len = 0; @@ -756,18 +771,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb, mid = (low + high) / 2; offset = p + mid * item_size; - if (!map_token || offset < map_start || + if (!kaddr || offset < map_start || (offset + sizeof(struct btrfs_disk_key)) > map_start + map_len) { - if (map_token) { - unmap_extent_buffer(eb, map_token, KM_USER0); - map_token = NULL; - } err = map_private_extent_buffer(eb, offset, sizeof(struct btrfs_disk_key), - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (!err) { tmp = (struct btrfs_disk_key *)(kaddr + offset - @@ -790,14 +800,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb, high = mid; else { *slot = mid; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 0; } } *slot = low; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 1; } @@ -890,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = path->nodes[level]; - WARN_ON(!path->locks[level]); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && + path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); @@ -1228,7 +1235,6 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; - bool map = true; if (level != 1) return; @@ -1250,19 +1256,8 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; - if (node->map_token || path->skip_locking) - map = false; while (1) { - if (map && !node->map_token) { - unsigned long offset = btrfs_node_key_ptr_offset(nr); - map_private_extent_buffer(node, offset, - sizeof(struct btrfs_key_ptr), - &node->map_token, - &node->kaddr, - &node->map_start, - &node->map_len, KM_USER1); - } if (direction < 0) { if (nr == 0) break; @@ -1281,11 +1276,6 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (map && node->map_token) { - unmap_extent_buffer(node, node->map_token, - KM_USER1); - node->map_token = NULL; - } readahead_tree_block(root, search, blocksize, gen); nread += blocksize; } @@ -1293,10 +1283,6 @@ static void reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (map && node->map_token) { - unmap_extent_buffer(node, node->map_token, KM_USER1); - node->map_token = NULL; - } } /* @@ -1409,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, t = path->nodes[i]; if (i >= lowest_unlock && i > skip_level && path->locks[i]) { - btrfs_tree_unlock(t); + btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; } } @@ -1436,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) continue; if (!path->locks[i]) continue; - btrfs_tree_unlock(path->nodes[i]); + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; } } @@ -1485,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans, * we can trust our generation number */ free_extent_buffer(tmp); + btrfs_set_path_blocking(p); + tmp = read_tree_block(root, blocknr, blocksize, gen); if (tmp && btrfs_buffer_uptodate(tmp, gen)) { *eb_ret = tmp; @@ -1540,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans, static int setup_nodes_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer *b, int level, int ins_len) + struct extent_buffer *b, int level, int ins_len, + int *write_lock_level) { int ret; if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { int sret; + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + sret = reada_for_balance(root, p, level); if (sret) goto again; btrfs_set_path_blocking(p); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(sret > 0); if (sret) { @@ -1565,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { int sret; + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + sret = reada_for_balance(root, p, level); if (sret) goto again; btrfs_set_path_blocking(p); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); if (sret) { ret = sret; @@ -1615,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int err; int level; int lowest_unlock = 1; + int root_lock; + /* everything at write_lock_level or lower must be write locked */ + int write_lock_level = 0; u8 lowest_level = 0; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); - if (ins_len < 0) + if (ins_len < 0) { lowest_unlock = 2; + /* when we are removing items, we might have to go up to level + * two as we update tree pointers Make sure we keep write + * for those levels as well + */ + write_lock_level = 2; + } else if (ins_len > 0) { + /* + * for inserting items, make sure we have a write lock on + * level 1 so we can update keys + */ + write_lock_level = 1; + } + + if (!cow) + write_lock_level = -1; + + if (cow && (p->keep_locks || p->lowest_level)) + write_lock_level = BTRFS_MAX_LEVEL; + again: + /* + * we try very hard to do read locks on the root + */ + root_lock = BTRFS_READ_LOCK; + level = 0; if (p->search_commit_root) { + /* + * the commit roots are read only + * so we always do read locks + */ b = root->commit_root; extent_buffer_get(b); + level = btrfs_header_level(b); if (!p->skip_locking) - btrfs_tree_lock(b); + btrfs_tree_read_lock(b); } else { - if (p->skip_locking) + if (p->skip_locking) { b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); + level = btrfs_header_level(b); + } else { + /* we don't know the level of the root node + * until we actually have it read locked + */ + b = btrfs_read_lock_root_node(root); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + /* whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* the level might have changed, check again */ + level = btrfs_header_level(b); + } + } } + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; while (b) { level = btrfs_header_level(b); @@ -1644,10 +1696,6 @@ again: * setup the path here so we can release it under lock * contention with the cow code */ - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - if (cow) { /* * if we don't really need to cow this block @@ -1659,6 +1707,16 @@ again: btrfs_set_path_blocking(p); + /* + * must have write locks on this node and the + * parent + */ + if (level + 1 > write_lock_level) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); @@ -1671,10 +1729,7 @@ cow_done: BUG_ON(!cow && ins_len); p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -1700,7 +1755,7 @@ cow_done: } p->slots[level] = slot; err = setup_nodes_for_search(trans, root, p, b, level, - ins_len); + ins_len, &write_lock_level); if (err == -EAGAIN) goto again; if (err) { @@ -1710,6 +1765,19 @@ cow_done: b = p->nodes[level]; slot = p->slots[level]; + /* + * slot 0 is special, if we change the key + * we have to update the parent pointer + * which means we must have a write lock + * on the parent + */ + if (slot == 0 && cow && + write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + unlock_up(p, level, lowest_unlock); if (level == lowest_level) { @@ -1728,23 +1796,42 @@ cow_done: } if (!p->skip_locking) { - btrfs_clear_path_blocking(p, NULL); - err = btrfs_try_spin_lock(b); - - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + err = btrfs_try_tree_write_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_WRITE_LOCK); + } + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; } + p->nodes[level] = b; } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { + if (write_lock_level < 1) { + write_lock_level = 1; + btrfs_release_path(p); + goto again; + } + btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(err > 0); if (err) { @@ -2025,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; return 0; } @@ -2253,14 +2340,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] == i) push_space += data_size; - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } - this_item_size = btrfs_item_size(left, item); if (this_item_size + sizeof(*item) + push_space > free_space) break; @@ -2271,10 +2350,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, break; i--; } - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } if (push_items == 0) goto out_unlock; @@ -2316,21 +2391,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } push_space -= btrfs_item_size(right, item); btrfs_set_item_offset(right, item, push_space); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } left_nritems -= push_items; btrfs_set_header_nritems(left, left_nritems); @@ -2467,13 +2531,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < nr; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } if (!empty && push_items > 0) { if (path->slots[0] < i) @@ -2496,11 +2553,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, push_space += this_item_size + sizeof(*item); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - if (push_items == 0) { ret = 1; goto out; @@ -2530,23 +2582,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(left, i); - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } ioff = btrfs_item_offset(left, item); btrfs_set_item_offset(left, item, ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } /* fixup right node */ if (push_items > right_nritems) { @@ -2574,21 +2615,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - push_space = push_space - btrfs_item_size(right, item); btrfs_set_item_offset(right, item, push_space); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } btrfs_mark_buffer_dirty(left); if (right_nritems) @@ -2729,23 +2758,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(right, item); btrfs_set_item_offset(right, item, ioff + rt_data_off); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - btrfs_set_header_nritems(l, mid); ret = 0; btrfs_item_key(right, &disk_key, 0); @@ -3264,23 +3280,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff + size_diff); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the data */ if (from_end) { memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + @@ -3377,22 +3380,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - data_size); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the data */ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end - data_size, btrfs_leaf_data(leaf) + @@ -3494,27 +3485,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - WARN_ON(leaf->map_token); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - total_data); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), btrfs_item_nr_offset(slot), @@ -3608,27 +3585,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - WARN_ON(leaf->map_token); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - total_data); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), btrfs_item_nr_offset(slot), @@ -3840,22 +3803,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff + dsize); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), btrfs_item_nr_offset(slot + nr), sizeof(struct btrfs_item) * @@ -4004,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, WARN_ON(!path->keep_locks); again: - cur = btrfs_lock_root_node(root); + cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); WARN_ON(path->nodes[level]); path->nodes[level] = cur; - path->locks[level] = 1; + path->locks[level] = BTRFS_READ_LOCK; if (btrfs_header_generation(cur) < min_trans) { ret = 1; @@ -4098,12 +4049,12 @@ find_next_key: cur = read_node_slot(root, cur, slot); BUG_ON(!cur); - btrfs_tree_lock(cur); + btrfs_tree_read_lock(cur); - path->locks[level - 1] = 1; + path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; unlock_up(path, level, 1); - btrfs_clear_path_blocking(path, NULL); + btrfs_clear_path_blocking(path, NULL, 0); } out: if (ret == 0) @@ -4218,30 +4169,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) u32 nritems; int ret; int old_spinning = path->leave_spinning; - int force_blocking = 0; + int next_rw_lock = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - /* - * we take the blocks in an order that upsets lockdep. Using - * blocking mode is the only way around it. - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - force_blocking = 1; -#endif - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); again: level = 1; next = NULL; + next_rw_lock = 0; btrfs_release_path(path); path->keep_locks = 1; - - if (!force_blocking) - path->leave_spinning = 1; + path->leave_spinning = 1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; @@ -4281,11 +4223,12 @@ again: } if (next) { - btrfs_tree_unlock(next); + btrfs_tree_unlock_rw(next, next_rw_lock); free_extent_buffer(next); } next = c; + next_rw_lock = path->locks[level]; ret = read_block_for_search(NULL, root, path, &next, level, slot, &key); if (ret == -EAGAIN) @@ -4297,15 +4240,14 @@ again: } if (!path->skip_locking) { - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } break; } @@ -4314,14 +4256,13 @@ again: level--; c = path->nodes[level]; if (path->locks[level]) - btrfs_tree_unlock(c); + btrfs_tree_unlock_rw(c, path->locks[level]); free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) - path->locks[level] = 1; - + path->locks[level] = next_rw_lock; if (!level) break; @@ -4336,16 +4277,14 @@ again: } if (!path->skip_locking) { - btrfs_assert_tree_locked(path->nodes[level]); - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } } ret = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 82be74efbb2..365c4e1dde0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -755,6 +755,8 @@ struct btrfs_space_info { chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + unsigned int flush:1; /* set if we are trying to make space */ + unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -764,7 +766,7 @@ struct btrfs_space_info { struct list_head block_groups[BTRFS_NR_RAID_TYPES]; spinlock_t lock; struct rw_semaphore groups_sem; - atomic_t caching_threads; + wait_queue_head_t wait; }; struct btrfs_block_rsv { @@ -824,6 +826,7 @@ struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; + struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; atomic_t count; @@ -1032,6 +1035,8 @@ struct btrfs_fs_info { struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; + struct btrfs_workers caching_workers; + /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -2128,7 +2133,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) /* extent-tree.c */ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, - int num_items) + unsigned num_items) { return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items; @@ -2222,9 +2227,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2330,7 +2332,7 @@ struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held); + struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2645,9 +2647,9 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask); +struct posix_acl *btrfs_get_acl(struct inode *inode, int type); #else -#define btrfs_check_acl NULL +#define btrfs_get_acl NULL #endif int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 98c68e658a9..b52c672f4c1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, } /* reset all the locked nodes in the patch to spinning locks. */ - btrfs_clear_path_blocking(path, NULL); + btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ ret = setup_items_for_insert(trans, root, path, keys, data_size, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 8d27af4bd8b..7083d08b2a2 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -25,7 +25,7 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "ctree.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 685f2593c4f..c360a848d97 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); - /* - * FIXME: at some point we should handle xattr's that are larger than - * what we can fit in our leaf. We set location to NULL b/c we arent - * pointing at anything else, that will change if we store the xattr - * data in a separate inode. - */ - BUG_ON(IS_ERR(dir_item)); + if (IS_ERR(dir_item)) + return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b231ae13b26..07b3ac662e1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -100,38 +100,83 @@ struct async_submit_bio { struct btrfs_work work; }; -/* These are used to set the lockdep class on the extent buffer locks. - * The class is set by the readpage_end_io_hook after the buffer has - * passed csum validation but before the pages are unlocked. +/* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->objectid. This ensures that all special purpose roots + * have separate keysets. * - * The lockdep class is also set by btrfs_init_new_buffer on freshly - * allocated blocks. + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. * - * The class is based on the level in the tree block, which allows lockdep - * to know that lower nodes nest inside the locks of higher nodes. + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. * - * We also add a check to make sure the highest level of the tree is - * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this - * code needs update as well. + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC # if BTRFS_MAX_LEVEL != 8 # error # endif -static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; -static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { - /* leaf */ - "btrfs-extent-00", - "btrfs-extent-01", - "btrfs-extent-02", - "btrfs-extent-03", - "btrfs-extent-04", - "btrfs-extent-05", - "btrfs-extent-06", - "btrfs-extent-07", - /* highest possible level */ - "btrfs-extent-08", + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + const char *name_stem; /* lock name stem */ + char names[BTRFS_MAX_LEVEL + 1][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, + { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, + { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, + { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, + { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, + { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, + { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, + { .id = 0, .name_stem = "tree" }, }; + +void __init btrfs_init_lockdep(void) +{ + int i, j; + + /* initialize lockdep class names */ + for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { + struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; + + for (j = 0; j < ARRAY_SIZE(ks->names); j++) + snprintf(ks->names[j], sizeof(ks->names[j]), + "btrfs-%s-%02d", ks->name_stem, j); + } +} + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, + int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, + &ks->keys[level], ks->names[level]); +} + #endif /* @@ -217,7 +262,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; - char *map_token = NULL; char *kaddr; unsigned long map_start; unsigned long map_len; @@ -228,8 +272,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, len = buf->len - offset; while (len > 0) { err = map_private_extent_buffer(buf, offset, 32, - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (err) return 1; cur_len = min(len, map_len - (offset - map_start)); @@ -237,7 +280,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, crc, cur_len); len -= cur_len; offset += cur_len; - unmap_extent_buffer(buf, map_token, KM_USER0); } if (csum_size > sizeof(inline_result)) { result = kzalloc(csum_size * sizeof(char), GFP_NOFS); @@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) -{ - lockdep_set_class_and_name(&eb->lock, - &btrfs_eb_class[level], - btrfs_eb_name[level]); -} -#endif - static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); - btrfs_set_buffer_lockdep_class(eb, found_level); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), + eb, found_level); ret = csum_tree_block(root, eb, 1); if (ret) { @@ -1598,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_bdi; } - fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -1802,6 +1836,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->thread_pool_size), &fs_info->generic_worker); + btrfs_init_workers(&fs_info->caching_workers, "cache", + 2, &fs_info->generic_worker); + /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices @@ -1855,6 +1892,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_write_workers, 1); btrfs_start_workers(&fs_info->endio_freespace_worker, 1); btrfs_start_workers(&fs_info->delayed_workers, 1); + btrfs_start_workers(&fs_info->caching_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2112,6 +2150,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: kfree(fs_info->delayed_root); fail_iput: @@ -2577,6 +2616,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0b610a67aa..bec3ea4bd67 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,10 +87,14 @@ int btree_lock_page_hook(struct page *page); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +void btrfs_init_lockdep(void); +void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level); #else -static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, - int level) +static inline void btrfs_init_lockdep(void) +{ } +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) { } #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71cd456fdb6..4d08ed79405 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static int caching_kthread(void *data) +static noinline void caching_thread(struct btrfs_work *work) { - struct btrfs_block_group_cache *block_group = data; - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; @@ -334,9 +334,14 @@ static int caching_kthread(void *data) u32 nritems; int ret = 0; + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + goto out; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -433,13 +438,11 @@ err: free_excluded_extents(extent_root, block_group); mutex_unlock(&caching_ctl->mutex); +out: wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); - atomic_dec(&block_group->space_info->caching_threads); btrfs_put_block_group(block_group); - - return 0; } static int cache_block_group(struct btrfs_block_group_cache *cache, @@ -449,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; - struct task_struct *tsk; int ret = 0; smp_mb(); @@ -501,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->progress = cache->key.objectid; /* one for caching kthread, one for caching block group list */ atomic_set(&caching_ctl->count, 2); + caching_ctl->work.func = caching_thread; spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -516,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); - atomic_inc(&cache->space_info->caching_threads); btrfs_get_block_group(cache); - tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", - cache->key.objectid); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - printk(KERN_ERR "error running thread %d\n", ret); - BUG(); - } + btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -2932,9 +2928,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; + found->flush = 0; + init_waitqueue_head(&found->wait); *space_info = found; list_add_rcu(&found->list, &info->space_info); - atomic_set(&found->caching_threads, 0); return 0; } @@ -3314,6 +3311,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; + smp_mb(); + if (root->fs_info->delalloc_bytes == 0) { + if (trans) + return 0; + btrfs_wait_ordered_extents(root, 0, 0); + return 0; + } + max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { @@ -3356,6 +3361,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } + if (reclaimed >= to_reclaim && !trans) + btrfs_wait_ordered_extents(root, 0, 0); return reclaimed >= to_reclaim; } @@ -3380,15 +3387,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; - bool reserved = false; bool committed = false; + bool flushing = false; again: - ret = -ENOSPC; - if (reserved) - num_bytes = 0; - + ret = 0; spin_lock(&space_info->lock); + /* + * We only want to wait if somebody other than us is flushing and we are + * actually alloed to flush. + */ + while (flush && !flushing && space_info->flush) { + spin_unlock(&space_info->lock); + /* + * If we have a trans handle we can't wait because the flusher + * may have to commit the transaction, which would mean we would + * deadlock since we are waiting for the flusher to finish, but + * hold the current transaction open. + */ + if (trans) + return -EAGAIN; + ret = wait_event_interruptible(space_info->wait, + !space_info->flush); + /* Must have been interrupted, return */ + if (ret) + return -EINTR; + + spin_lock(&space_info->lock); + } + + ret = -ENOSPC; unused = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -3403,8 +3431,7 @@ again: if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; if (unused >= num_bytes) { - if (!reserved) - space_info->bytes_reserved += orig_bytes; + space_info->bytes_reserved += orig_bytes; ret = 0; } else { /* @@ -3429,17 +3456,14 @@ again: * to reclaim space we can actually use it instead of somebody else * stealing it from us. */ - if (ret && !reserved) { - space_info->bytes_reserved += orig_bytes; - reserved = true; + if (ret && flush) { + flushing = true; + space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret) - return 0; - - if (!flush) + if (!ret || !flush) goto out; /* @@ -3447,11 +3471,11 @@ again: * metadata until after the IO is completed. */ ret = shrink_delalloc(trans, root, num_bytes, 1); - if (ret > 0) - return 0; - else if (ret < 0) + if (ret < 0) goto out; + ret = 0; + /* * So if we were overcommitted it's possible that somebody else flushed * out enough space and we simply didn't have enough space to reclaim, @@ -3462,11 +3486,11 @@ again: goto again; } - spin_lock(&space_info->lock); /* * Not enough space to be reclaimed, don't bother committing the * transaction. */ + spin_lock(&space_info->lock); if (space_info->bytes_pinned < orig_bytes) ret = -ENOSPC; spin_unlock(&space_info->lock); @@ -3474,10 +3498,13 @@ again: goto out; ret = -EAGAIN; - if (trans || committed) + if (trans) goto out; ret = -ENOSPC; + if (committed) + goto out; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto out; @@ -3489,12 +3516,12 @@ again: } out: - if (reserved) { + if (flushing) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= orig_bytes; + space_info->flush = 0; + wake_up_all(&space_info->wait); spin_unlock(&space_info->lock); } - return ret; } @@ -3704,7 +3731,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (commit_trans) { if (trans) return -EAGAIN; - trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); @@ -3874,26 +3900,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, return 0; } -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items) -{ - u64 num_bytes; - int ret; - - if (num_items == 0 || root->fs_info->chunk_root == root) - return 0; - - num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes); - if (!ret) { - trans->bytes_reserved += num_bytes; - trans->block_rsv = &root->fs_info->trans_block_rsv; - } - return ret; -} - void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -3944,6 +3950,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +static unsigned drop_outstanding_extent(struct inode *inode) +{ + unsigned dropped_extents = 0; + + spin_lock(&BTRFS_I(inode)->lock); + BUG_ON(!BTRFS_I(inode)->outstanding_extents); + BTRFS_I(inode)->outstanding_extents--; + + /* + * If we have more or the same amount of outsanding extents than we have + * reserved then we need to leave the reserved extents count alone. + */ + if (BTRFS_I(inode)->outstanding_extents >= + BTRFS_I(inode)->reserved_extents) + goto out; + + dropped_extents = BTRFS_I(inode)->reserved_extents - + BTRFS_I(inode)->outstanding_extents; + BTRFS_I(inode)->reserved_extents -= dropped_extents; +out: + spin_unlock(&BTRFS_I(inode)->lock); + return dropped_extents; +} + static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) { return num_bytes >>= 3; @@ -3953,9 +3983,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; - u64 to_reserve; - int nr_extents; - int reserved_extents; + u64 to_reserve = 0; + unsigned nr_extents = 0; int ret; if (btrfs_transaction_in_commit(root->fs_info)) @@ -3963,66 +3992,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; - reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + + if (BTRFS_I(inode)->outstanding_extents > + BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->outstanding_extents - + BTRFS_I(inode)->reserved_extents; + BTRFS_I(inode)->reserved_extents += nr_extents; - if (nr_extents > reserved_extents) { - nr_extents -= reserved_extents; to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); - } else { - nr_extents = 0; - to_reserve = 0; } + spin_unlock(&BTRFS_I(inode)->lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); - if (ret) + if (ret) { + unsigned dropped; + /* + * We don't need the return value since our reservation failed, + * we just need to clean up our counter. + */ + dropped = drop_outstanding_extent(inode); + WARN_ON(dropped > 1); return ret; - - atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + } block_rsv_add_bytes(block_rsv, to_reserve, 1); - if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve, 0); - return 0; } void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 to_free; - int nr_extents; - int reserved_extents; + u64 to_free = 0; + unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); - atomic_dec(&BTRFS_I(inode)->outstanding_extents); - WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); - - reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); - do { - int old, new; - - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); - if (nr_extents >= reserved_extents) { - nr_extents = 0; - break; - } - old = reserved_extents; - nr_extents = reserved_extents - nr_extents; - new = reserved_extents - nr_extents; - old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, - reserved_extents, new); - if (likely(old == reserved_extents)) - break; - reserved_extents = old; - } while (1); + dropped = drop_outstanding_extent(inode); to_free = calc_csum_metadata_size(inode, num_bytes); - if (nr_extents > 0) - to_free += btrfs_calc_trans_metadata_size(root, nr_extents); + if (dropped > 0) + to_free += btrfs_calc_trans_metadata_size(root, dropped); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); @@ -4990,14 +5002,10 @@ have_block_group: } /* - * We only want to start kthread caching if we are at - * the point where we will wait for caching to make - * progress, or if our ideal search is over and we've - * found somebody to start caching. + * The caching workers are limited to 2 threads, so we + * can queue as much work as we care to. */ - if (loop > LOOP_CACHING_NOWAIT || - (loop > LOOP_FIND_IDEAL && - atomic_read(&space_info->caching_threads) < 2)) { + if (loop > LOOP_FIND_IDEAL) { ret = cache_block_group(block_group, trans, orig_root, 0); BUG_ON(ret); @@ -5219,8 +5227,7 @@ loop: if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; loop++; - if (!ideal_cache_percent && - atomic_read(&space_info->caching_threads)) + if (!ideal_cache_percent) goto search; /* @@ -5623,7 +5630,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); - btrfs_set_buffer_lockdep_class(buf, level); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); @@ -5910,7 +5917,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, return 1; if (path->locks[level] && !wc->keep_locks) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; @@ -5934,7 +5941,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * keep the tree lock */ if (path->locks[level] && level > 0) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; @@ -6047,7 +6054,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, BUG_ON(level != btrfs_header_level(next)); path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -6118,7 +6125,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(level == 0); btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -6127,8 +6134,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { - btrfs_tree_unlock(eb); - path->locks[level] = 0; + btrfs_tree_unlock_rw(eb, path->locks[level]); return 1; } } @@ -6150,7 +6156,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } clean_tree_block(trans, root, eb); } @@ -6229,7 +6235,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, return 0; if (path->locks[level]) { - btrfs_tree_unlock(path->nodes[level]); + btrfs_tree_unlock_rw(path->nodes[level], + path->locks[level]); path->locks[level] = 0; } free_extent_buffer(path->nodes[level]); @@ -6281,7 +6288,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->nodes[level] = btrfs_lock_root_node(root); btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -6449,7 +6456,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -6524,15 +6531,28 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache) +static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; + u64 min_allocable_bytes; int ret = -ENOSPC; if (cache->ro) return 0; + /* + * We need some metadata space and system metadata space for + * allocating chunks in some corner cases until we force to set + * it to be readonly. + */ + if ((sinfo->flags & + (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && + !force) + min_allocable_bytes = 1 * 1024 * 1024; + else + min_allocable_bytes = 0; + spin_lock(&sinfo->lock); spin_lock(&cache->lock); num_bytes = cache->key.offset - cache->reserved - cache->pinned - @@ -6540,7 +6560,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache) if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { + cache->reserved_pinned + num_bytes + min_allocable_bytes <= + sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; sinfo->bytes_reserved += cache->reserved_pinned; cache->reserved_pinned = 0; @@ -6571,7 +6592,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, CHUNK_ALLOC_FORCE); - ret = set_block_group_ro(cache); + ret = set_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -6579,7 +6600,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache); + ret = set_block_group_ro(cache, 0); out: btrfs_end_transaction(trans, root); return ret; @@ -7016,7 +7037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -7030,9 +7051,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) * mirrored block groups. */ list_for_each_entry(cache, &space_info->block_groups[3], list) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[4], list) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); } init_global_block_rsv(info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11c1ef..067b1747421 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -281,11 +281,10 @@ static int merge_state(struct extent_io_tree *tree, if (other->start == state->end + 1 && other->state == state->state) { merge_cb(tree, state, other); - other->start = state->start; - state->tree = NULL; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - state = NULL; + state->end = other->end; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); } } @@ -351,7 +350,6 @@ static int insert_state(struct extent_io_tree *tree, "%llu %llu\n", (unsigned long long)found->start, (unsigned long long)found->end, (unsigned long long)start, (unsigned long long)end); - free_extent_state(state); return -EEXIST; } state->tree = tree; @@ -500,7 +498,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start == start) { + if (cached && cached->tree && cached->start <= start && + cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -742,7 +741,8 @@ again: spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->start == start && state->tree) { + if (state->start <= start && state->end > start && + state->tree) { node = &state->rb_node; goto hit_next; } @@ -783,13 +783,13 @@ hit_next: if (err) goto out; - next_node = rb_next(node); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; + next_node = rb_next(&state->rb_node); if (next_node && start < end && prealloc && !need_resched()) { state = rb_entry(next_node, struct extent_state, rb_node); @@ -862,7 +862,6 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - atomic_inc(&prealloc->refs); err = insert_state(tree, prealloc, start, this_end, &bits); BUG_ON(err == -EEXIST); @@ -872,7 +871,6 @@ hit_next: goto out; } cache_state(prealloc, cached_state); - free_extent_state(prealloc); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1564,7 +1562,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start == start) + if (cached && cached->tree && cached->start <= start && + cached->end > start) node = &cached->rb_node; else node = tree_search(tree, start); @@ -2432,6 +2431,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2442,11 +2442,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, end = wbc->range_end >> PAGE_CACHE_SHIFT; scanned = 1; } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; @@ -2551,7 +2556,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, }; struct writeback_control wbc_writepages = { .sync_mode = wbc->sync_mode, - .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, @@ -2584,7 +2588,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; struct writeback_control wbc_writepages = { .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, @@ -3022,8 +3025,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - spin_lock_init(&eb->lock); - init_waitqueue_head(&eb->lock_wq); + rwlock_init(&eb->lock); + atomic_set(&eb->write_locks, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->blocking_readers, 0); + atomic_set(&eb->blocking_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->spinning_writers, 0); + init_waitqueue_head(&eb->write_lock_wq); + init_waitqueue_head(&eb->read_lock_wq); #if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); @@ -3119,7 +3129,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, i = 0; } for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); + p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; @@ -3266,6 +3276,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, return was_dirty; } +static int __eb_straddles_pages(u64 start, u64 len) +{ + if (len < PAGE_CACHE_SIZE) + return 1; + if (start & (PAGE_CACHE_SIZE - 1)) + return 1; + if ((start + len) & (PAGE_CACHE_SIZE - 1)) + return 1; + return 0; +} + +static int eb_straddles_pages(struct extent_buffer *eb) +{ + return __eb_straddles_pages(eb->start, eb->len); +} + int clear_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state **cached_state) @@ -3277,8 +3303,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); + if (eb_straddles_pages(eb)) { + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3296,8 +3324,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); + if (eb_straddles_pages(eb)) { + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3320,9 +3350,12 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; + if (__eb_straddles_pages(start, end - start + 1)) { + ret = test_range_bit(tree, start, end, + EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + } while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); @@ -3350,10 +3383,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; + if (eb_straddles_pages(eb)) { + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + } num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { @@ -3386,9 +3421,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; + if (eb_straddles_pages(eb)) { + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } } if (start) { @@ -3492,9 +3529,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); dst += cur; len -= cur; @@ -3504,9 +3540,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km) + unsigned long *map_len) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; @@ -3536,42 +3572,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } p = extent_buffer_page(eb, i); - kaddr = kmap_atomic(p, km); - *token = kaddr; + kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; return 0; } -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} - int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) @@ -3595,9 +3601,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); if (ret) break; @@ -3630,9 +3635,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); src += cur; len -= cur; @@ -3661,9 +3665,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); len -= cur; offset = 0; @@ -3694,9 +3697,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); src_offset += cur; len -= cur; @@ -3709,20 +3711,17 @@ static void move_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); if (dst_page == src_page) { memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *src_kaddr = page_address(src_page); char *p = dst_kaddr + dst_off + len; char *s = src_kaddr + src_off + len; while (len--) *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); } - kunmap_atomic(dst_kaddr, KM_USER0); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -3735,20 +3734,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); char *src_kaddr; if (dst_page != src_page) { - src_kaddr = kmap_atomic(src_page, KM_USER1); + src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; BUG_ON(areas_overlap(src_off, dst_off, len)); } memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a11a92ee2d3..21a7ca9e728 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -120,8 +120,6 @@ struct extent_state { struct extent_buffer { u64 start; unsigned long len; - char *map_token; - char *kaddr; unsigned long map_start; unsigned long map_len; struct page *first_page; @@ -130,14 +128,26 @@ struct extent_buffer { struct rcu_head rcu_head; atomic_t refs; - /* the spinlock is used to protect most operations */ - spinlock_t lock; + /* count of read lock holders on the extent buffer */ + atomic_t write_locks; + atomic_t read_locks; + atomic_t blocking_writers; + atomic_t blocking_readers; + atomic_t spinning_readers; + atomic_t spinning_writers; + + /* protects write locks */ + rwlock_t lock; - /* - * when we keep the lock held while blocking, waiters go onto - * the wq + /* readers use lock_wq while they wait for the write + * lock holders to unlock */ - wait_queue_head_t lock_wq; + wait_queue_head_t write_lock_wq; + + /* writers use read_lock_wq while they wait for readers + * to unlock + */ + wait_queue_head_t read_lock_wq; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -279,15 +289,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, int extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state *cached_state); -int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km); -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); + unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90d4ee52cd4..08bcfa92a22 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -177,6 +177,15 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, WARN_ON(bio->bi_vcnt <= 0); + /* + * the free space stuff is only read when it hasn't been + * updated in the current transaction. So, we can safely + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. + */ + if (btrfs_is_free_space_inode(root, inode)) + path->search_commit_root = 1; + disk_bytenr = (u64)bio->bi_sector << 9; if (dio) offset = logical_offset; @@ -664,10 +673,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - char *eb_map; - char *eb_token; - unsigned long map_len; - unsigned long map_start; u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); @@ -814,30 +819,9 @@ found: item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); - eb_token = NULL; next_sector: - if (!eb_token || - (unsigned long)item + csum_size >= map_start + map_len) { - int err; - - if (eb_token) - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - err = map_private_extent_buffer(leaf, (unsigned long)item, - csum_size, - &eb_token, &eb_map, - &map_start, &map_len, KM_USER1); - if (err) - eb_token = NULL; - } - if (eb_token) { - memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), - §or_sum->sum, csum_size); - } else { - write_extent_buffer(leaf, §or_sum->sum, - (unsigned long)item, csum_size); - } + write_extent_buffer(leaf, §or_sum->sum, (unsigned long)item, csum_size); total_bytes += root->sectorsize; sector_sum++; @@ -850,10 +834,7 @@ next_sector: goto next_sector; } } - if (eb_token) { - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - } + btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 59cbdb120ad..a35e51c9f23 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1081,7 +1081,8 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { - pages[i] = grab_cache_page(inode->i_mapping, index + i); + pages[i] = find_or_create_page(inode->i_mapping, index + i, + GFP_NOFS); if (!pages[i]) { faili = i - 1; err = -ENOMEM; @@ -1238,9 +1239,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * managed to copy. */ if (num_pages > dirty_pages) { - if (copied > 0) - atomic_inc( - &BTRFS_I(inode)->outstanding_extents); + if (copied > 0) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } btrfs_delalloc_release_space(inode, (num_pages - dirty_pages) << PAGE_CACHE_SHIFT); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bf0d61567f3..6377713f639 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + printk(KERN_INFO "Old style space inode found, converting.\n"); + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + block_group->disk_cache_state = BTRFS_DC_CLEAR; + } + if (!btrfs_fs_closing(root->fs_info)) { block_group->inode = igrab(inode); block_group->iref = 1; @@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + BTRFS_INODE_PREALLOC); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -239,17 +245,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct page *page; - u32 *checksums = NULL, *crc; - char *disk_crcs = NULL; struct btrfs_key key; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - u32 cur_crc = ~(u32)0; pgoff_t index = 0; - unsigned long first_page_offset; - int num_checksums; int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -292,16 +293,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) goto out; - /* Setup everything for doing checksumming */ - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; - checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); - if (!checksums) - goto out; - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - disk_crcs = kzalloc(first_page_offset, GFP_NOFS); - if (!disk_crcs) - goto out; - ret = readahead_cache(inode); if (ret) goto out; @@ -311,18 +302,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space *e; void *addr; unsigned long offset = 0; - unsigned long start_offset = 0; int need_loop = 0; if (!num_entries && !num_bitmaps) break; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } - - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) goto free_cache; @@ -342,8 +327,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (index == 0) { u64 *gen; - memcpy(disk_crcs, addr, first_page_offset); - gen = addr + (sizeof(u32) * num_checksums); + /* + * We put a bogus crc in the front of the first page in + * case old kernels try to mount a fs with the new + * format to make sure they discard the cache. + */ + addr += sizeof(u64); + offset += sizeof(u64); + + gen = addr; if (*gen != BTRFS_I(inode)->generation) { printk(KERN_ERR "btrfs: space cache generation" " (%llu) does not match inode (%llu)\n", @@ -355,24 +347,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, page_cache_release(page); goto free_cache; } - crc = (u32 *)disk_crcs; - } - entry = addr + start_offset; - - /* First lets check our crc before we do anything fun */ - cur_crc = ~(u32)0; - cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, - PAGE_CACHE_SIZE - start_offset); - btrfs_csum_final(cur_crc, (char *)&cur_crc); - if (cur_crc != *crc) { - printk(KERN_ERR "btrfs: crc mismatch for page %lu\n", - index); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; + addr += sizeof(u64); + offset += sizeof(u64); } - crc++; + entry = addr; while (1) { if (!num_entries) @@ -470,8 +448,6 @@ next: ret = 1; out: - kfree(checksums); - kfree(disk_crcs); return ret; free_cache: __btrfs_remove_free_space_cache(ctl); @@ -569,8 +545,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_key key; u64 start, end, len; u64 bytes = 0; - u32 *crc, *checksums; - unsigned long first_page_offset; + u32 crc = ~(u32)0; int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; @@ -590,34 +565,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* Since the first page has all of our checksums and our generation we - * need to calculate the offset into the page that we can start writing - * our entries. - */ - first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); - filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); - /* make sure we don't overflow that first page */ - if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { - /* this is really the same as running out of space, where we also return 0 */ - printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); - ret = 0; - goto out_update; - } - - /* We need a checksum per page. */ - crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); - if (!crc) - return -1; - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) { - kfree(crc); + if (!pages) return -1; - } /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -640,7 +594,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * know and don't freak out. */ while (index < num_pages) { - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { int i; @@ -648,7 +602,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_page(pages[i]); page_cache_release(pages[i]); } - goto out_free; + goto out; } pages[index] = page; index++; @@ -668,17 +622,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Write out the extent entries */ do { struct btrfs_free_space_entry *entry; - void *addr; + void *addr, *orig; unsigned long offset = 0; - unsigned long start_offset = 0; next_page = false; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } - if (index >= num_pages) { out_of_space = true; break; @@ -686,10 +634,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, page = pages[index]; - addr = kmap(page); - entry = addr + start_offset; + orig = addr = kmap(page); + if (index == 0) { + u64 *gen; - memset(addr, 0, PAGE_CACHE_SIZE); + /* + * We're going to put in a bogus crc for this page to + * make sure that old kernels who aren't aware of this + * format will be sure to discard the cache. + */ + addr += sizeof(u64); + offset += sizeof(u64); + + gen = addr; + *gen = trans->transid; + addr += sizeof(u64); + offset += sizeof(u64); + } + entry = addr; + + memset(addr, 0, PAGE_CACHE_SIZE - offset); while (node && !next_page) { struct btrfs_free_space *e; @@ -752,13 +716,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, next_page = true; entry++; } - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr + start_offset, *crc, - PAGE_CACHE_SIZE - start_offset); - kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; + /* Generate bogus crc value */ + if (index == 0) { + u32 *tmp; + crc = btrfs_csum_data(root, orig + sizeof(u64), crc, + PAGE_CACHE_SIZE - sizeof(u64)); + btrfs_csum_final(crc, (char *)&crc); + crc++; + tmp = orig; + *tmp = crc; + } + + kunmap(page); bytes += PAGE_CACHE_SIZE; @@ -779,11 +749,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, addr = kmap(page); memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; bytes += PAGE_CACHE_SIZE; list_del_init(&entry->list); @@ -796,7 +762,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, i_size_read(inode) - 1, &cached_state, GFP_NOFS); ret = 0; - goto out_free; + goto out; } /* Zero out the rest of the pages just to make sure */ @@ -811,20 +777,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, index++; } - /* Write the checksums and trans id to the first page */ - { - void *addr; - u64 *gen; - - page = pages[0]; - - addr = kmap(page); - memcpy(addr, checksums, sizeof(u32) * num_pages); - gen = addr + (sizeof(u32) * num_pages); - *gen = trans->transid; - kunmap(page); - } - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, bytes, &cached_state); btrfs_drop_pages(pages, num_pages); @@ -833,7 +785,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) { ret = 0; - goto out_free; + goto out; } BTRFS_I(inode)->generation = trans->transid; @@ -850,7 +802,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); - goto out_free; + goto out; } leaf = path->nodes[0]; if (ret > 0) { @@ -866,7 +818,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); btrfs_release_path(path); - goto out_free; + goto out; } } header = btrfs_item_ptr(leaf, path->slots[0], @@ -879,11 +831,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = 1; -out_free: - kfree(checksums); +out: kfree(pages); - -out_update: if (ret != 1) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2548a04a023..13e6255182e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -750,15 +750,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, return alloc_hint; } -static inline bool is_free_space_inode(struct btrfs_root *root, - struct inode *inode) -{ - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; - return false; -} - /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -791,7 +782,7 @@ static noinline int cow_file_range(struct inode *inode, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; - BUG_ON(is_free_space_inode(root, inode)); + BUG_ON(btrfs_is_free_space_inode(root, inode)); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1072,7 +1063,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, path = btrfs_alloc_path(); BUG_ON(!path); - nolock = is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(root, inode); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -1298,7 +1289,9 @@ static int btrfs_split_extent_hook(struct inode *inode, if (!(orig->state & EXTENT_DELALLOC)) return 0; - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); return 0; } @@ -1316,7 +1309,9 @@ static int btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return 0; - atomic_dec(&BTRFS_I(inode)->outstanding_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); return 0; } @@ -1337,12 +1332,15 @@ static int btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(root, inode); - if (*bits & EXTENT_FIRST_DELALLOC) + if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - else - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += len; @@ -1370,12 +1368,15 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(root, inode); - if (*bits & EXTENT_FIRST_DELALLOC) + if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - else if (!(*bits & EXTENT_DO_ACCOUNTING)) - atomic_dec(&BTRFS_I(inode)->outstanding_extents); + } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + } if (*bits & EXTENT_DO_ACCOUNTING) btrfs_delalloc_release_metadata(inode, len); @@ -1477,7 +1478,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (is_free_space_inode(root, inode)) + if (btrfs_is_free_space_inode(root, inode)) ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); else ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); @@ -1726,7 +1727,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) return 0; BUG_ON(!ordered_extent); - nolock = is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(root, inode); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); @@ -2531,13 +2532,6 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - if (!leaf->map_token) - map_private_extent_buffer(leaf, (unsigned long)inode_item, - sizeof(struct btrfs_inode_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); inode->i_uid = btrfs_inode_uid(leaf, inode_item); @@ -2575,11 +2569,6 @@ cache_acl: if (!maybe_acls) cache_no_acl(inode); - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - btrfs_free_path(path); switch (inode->i_mode & S_IFMT) { @@ -2624,13 +2613,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - if (!leaf->map_token) - map_private_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_inode_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - btrfs_set_inode_uid(leaf, item, inode->i_uid); btrfs_set_inode_gid(leaf, item, inode->i_gid); btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); @@ -2659,11 +2641,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); btrfs_set_inode_block_group(leaf, item, 0); - - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } } /* @@ -2684,7 +2661,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!is_free_space_inode(root, inode) + if (!btrfs_is_free_space_inode(root, inode) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) @@ -3398,7 +3375,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) ret = -ENOMEM; again: - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; @@ -3634,7 +3611,7 @@ void btrfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || - is_free_space_inode(root, inode))) + btrfs_is_free_space_inode(root, inode))) goto no_delete; if (is_bad_inode(inode)) { @@ -4271,7 +4248,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (BTRFS_I(inode)->dummy_inode) return 0; - if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) + if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -4467,7 +4444,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); - if (mode & S_IFDIR) + if (S_ISDIR(mode)) owner = 0; else owner = 1; @@ -4512,7 +4489,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); - if ((mode & S_IFREG)) { + if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW) || @@ -6728,8 +6705,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; - atomic_set(&ei->outstanding_extents, 0); - atomic_set(&ei->reserved_extents, 0); + spin_lock_init(&ei->lock); + ei->outstanding_extents = 0; + ei->reserved_extents = 0; ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; @@ -6767,8 +6745,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); - WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); - WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); + WARN_ON(BTRFS_I(inode)->outstanding_extents); + WARN_ON(BTRFS_I(inode)->reserved_extents); /* * This can happen where we create an inode, but somebody else also @@ -6823,7 +6801,7 @@ int btrfs_drop_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0 && - !is_free_space_inode(root, inode)) + !btrfs_is_free_space_inode(root, inode)) return 1; else return generic_drop_inode(inode); @@ -7351,12 +7329,12 @@ static const struct inode_operations btrfs_dir_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct file_operations btrfs_dir_file_operations = { @@ -7425,7 +7403,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -7435,7 +7413,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, @@ -7447,7 +7425,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .check_acl = btrfs_check_acl, + .get_acl = btrfs_get_acl, }; const struct dentry_operations btrfs_dentry_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 622543309eb..0b980afc5ed 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -859,8 +859,8 @@ again: /* step one, lock all the pages */ for (i = 0; i < num_pages; i++) { struct page *page; - page = grab_cache_page(inode->i_mapping, - start_index + i); + page = find_or_create_page(inode->i_mapping, + start_index + i, GFP_NOFS); if (!page) break; @@ -930,7 +930,9 @@ again: GFP_NOFS); if (i_done != num_pages) { - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, (num_pages - i_done) << PAGE_CACHE_SHIFT); } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66fa43dc3f0..d77b67c4b27 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -24,185 +24,197 @@ #include "extent_io.h" #include "locking.h" -static inline void spin_nested(struct extent_buffer *eb) -{ - spin_lock(&eb->lock); -} +void btrfs_assert_tree_read_locked(struct extent_buffer *eb); /* - * Setting a lock to blocking will drop the spinlock and set the - * flag that forces other procs who want the lock to wait. After - * this you can safely schedule with the lock held. + * if we currently have a spinning reader or writer lock + * (indicated by the rw flag) this will bump the count + * of blocking holders and drop the spinlock. */ -void btrfs_set_lock_blocking(struct extent_buffer *eb) +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - spin_unlock(&eb->lock); + if (rw == BTRFS_WRITE_LOCK) { + if (atomic_read(&eb->blocking_writers) == 0) { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + btrfs_assert_tree_locked(eb); + atomic_inc(&eb->blocking_writers); + write_unlock(&eb->lock); + } + } else if (rw == BTRFS_READ_LOCK) { + btrfs_assert_tree_read_locked(eb); + atomic_inc(&eb->blocking_readers); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + read_unlock(&eb->lock); } - /* exit with the spin lock released and the bit set */ + return; } /* - * clearing the blocking flag will take the spinlock again. - * After this you can't safely schedule + * if we currently have a blocking lock, take the spinlock + * and drop our blocking count */ -void btrfs_clear_lock_blocking(struct extent_buffer *eb) +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - spin_nested(eb); - clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - smp_mb__after_clear_bit(); + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_writers) != 1); + write_lock(&eb->lock); + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + if (atomic_dec_and_test(&eb->blocking_writers)) + wake_up(&eb->write_lock_wq); + } else if (rw == BTRFS_READ_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_readers) == 0); + read_lock(&eb->lock); + atomic_inc(&eb->spinning_readers); + if (atomic_dec_and_test(&eb->blocking_readers)) + wake_up(&eb->read_lock_wq); } - /* exit with the spin lock held */ + return; } /* - * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for very long, and often they don't block - * at all. For a dbench 50 run, if we don't spin on the blocking bit - * at all, the context switch rate can jump up to 400,000/sec or more. - * - * So, we're still stuck with this crummy spin on the blocking bit, - * at least until the most common causes of the short blocks - * can be dealt with. + * take a spinning read lock. This will wait for any blocking + * writers */ -static int btrfs_spin_on_block(struct extent_buffer *eb) +void btrfs_tree_read_lock(struct extent_buffer *eb) { - int i; - - for (i = 0; i < 512; i++) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - if (need_resched()) - break; - cpu_relax(); +again: + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; } - return 0; + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); } /* - * This is somewhat different from trylock. It will take the - * spinlock but if it finds the lock is set to blocking, it will - * return without the lock held. - * - * returns 1 if it was able to take the lock and zero otherwise - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers */ -int btrfs_try_spin_lock(struct extent_buffer *eb) +int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - int i; + if (atomic_read(&eb->blocking_writers)) + return 0; - if (btrfs_spin_on_block(eb)) { - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; } - /* spin for a bit on the BLOCKING flag */ - for (i = 0; i < 2; i++) { - cpu_relax(); - if (!btrfs_spin_on_block(eb)) - break; - - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - } - return 0; + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; } /* - * the autoremove wake function will return 0 if it tried to wake up - * a process that was already awake, which means that process won't - * count as an exclusive wakeup. The waitq code will continue waking - * procs until it finds one that was actually sleeping. - * - * For btrfs, this isn't quite what we want. We want a single proc - * to be notified that the lock is ready for taking. If that proc - * already happen to be awake, great, it will loop around and try for - * the lock. - * - * So, btrfs_wake_function always returns 1, even when the proc that we - * tried to wake up was already awake. + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers or readers */ -static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, - int sync, void *key) +int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - autoremove_wake_function(wait, mode, sync, key); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) + return 0; + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->write_locks); + atomic_inc(&eb->spinning_writers); return 1; } /* - * returns with the extent buffer spinlocked. - * - * This will spin and/or wait as required to take the lock, and then - * return with the spinlock held. - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * drop a spinning read lock + */ +void btrfs_tree_read_unlock(struct extent_buffer *eb) +{ + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + atomic_dec(&eb->read_locks); + read_unlock(&eb->lock); +} + +/* + * drop a blocking read lock + */ +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) +{ + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->blocking_readers) == 0); + if (atomic_dec_and_test(&eb->blocking_readers)) + wake_up(&eb->read_lock_wq); + atomic_dec(&eb->read_locks); +} + +/* + * take a spinning write lock. This will wait for both + * blocking readers or writers */ int btrfs_tree_lock(struct extent_buffer *eb) { - DEFINE_WAIT(wait); - wait.func = btrfs_wake_function; - - if (!btrfs_spin_on_block(eb)) - goto sleep; - - while(1) { - spin_nested(eb); - - /* nobody is blocking, exit with the spinlock held */ - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 0; - - /* - * we have the spinlock, but the real owner is blocking. - * wait for them - */ - spin_unlock(&eb->lock); - - /* - * spin for a bit, and if the blocking flag goes away, - * loop around - */ - cpu_relax(); - if (btrfs_spin_on_block(eb)) - continue; -sleep: - prepare_to_wait_exclusive(&eb->lock_wq, &wait, - TASK_UNINTERRUPTIBLE); - - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - schedule(); - - finish_wait(&eb->lock_wq, &wait); +again: + wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + wait_event(eb->read_lock_wq, + atomic_read(&eb->blocking_readers) == 0); + goto again; } + if (atomic_read(&eb->blocking_writers)) { + write_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; + } + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + atomic_inc(&eb->write_locks); return 0; } +/* + * drop a spinning or a blocking write lock. + */ int btrfs_tree_unlock(struct extent_buffer *eb) { - /* - * if we were a blocking owner, we don't have the spinlock held - * just clear the bit and look for waiters - */ - if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - smp_mb__after_clear_bit(); - else - spin_unlock(&eb->lock); - - if (waitqueue_active(&eb->lock_wq)) - wake_up(&eb->lock_wq); + int blockers = atomic_read(&eb->blocking_writers); + + BUG_ON(blockers > 1); + + btrfs_assert_tree_locked(eb); + atomic_dec(&eb->write_locks); + + if (blockers) { + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_dec(&eb->blocking_writers); + smp_wmb(); + wake_up(&eb->write_lock_wq); + } else { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + write_unlock(&eb->lock); + } return 0; } void btrfs_assert_tree_locked(struct extent_buffer *eb) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - assert_spin_locked(&eb->lock); + BUG_ON(!atomic_read(&eb->write_locks)); +} + +void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->read_locks)); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5c33a560a2f..17247ddb81a 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -19,11 +19,43 @@ #ifndef __BTRFS_LOCKING_ #define __BTRFS_LOCKING_ +#define BTRFS_WRITE_LOCK 1 +#define BTRFS_READ_LOCK 2 +#define BTRFS_WRITE_LOCK_BLOCKING 3 +#define BTRFS_READ_LOCK_BLOCKING 4 + int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); -void btrfs_set_lock_blocking(struct extent_buffer *eb); -void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_unlock(struct extent_buffer *eb); +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); void btrfs_assert_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_read_lock(struct extent_buffer *eb); +int btrfs_try_tree_write_lock(struct extent_buffer *eb); + +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) +{ + if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + btrfs_tree_unlock(eb); + else if (rw == BTRFS_READ_LOCK_BLOCKING) + btrfs_tree_read_unlock_blocking(eb); + else if (rw == BTRFS_READ_LOCK) + btrfs_tree_read_unlock(eb); + else + BUG(); +} + +static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); +} + +static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); +} #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5e0a3dc79a4..59bb1764273 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2955,7 +2955,8 @@ static int relocate_file_extent_cluster(struct inode *inode, page_cache_sync_readahead(inode->i_mapping, ra, NULL, index, last_index + 1 - index); - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, + GFP_NOFS); if (!page) { btrfs_delalloc_release_metadata(inode, PAGE_CACHE_SIZE); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c0f7ecaf1e7..bc1f6ad1844 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - return le##bits##_to_cpu(p->member); \ - } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - u##bits res; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits leres; \ - read_eb_member(eb, s, type, member, &leres); \ - return le##bits##_to_cpu(leres); \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - res = le##bits##_to_cpu(p->member); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - return res; \ - } \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_private_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + return res; \ } \ void btrfs_set_##name(struct extent_buffer *eb, \ type *s, u##bits val) \ @@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - p->member = cpu_to_le##bits(val); \ - return; \ - } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits val2; \ - val2 = cpu_to_le##bits(val); \ - write_eb_member(eb, s, type, member, &val2); \ - return; \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - p->member = cpu_to_le##bits(val); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - } \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_private_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ } #include "ctree.h" @@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr = btrfs_node_key_ptr_offset(nr); - if (eb->map_token && ptr >= eb->map_start && - ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { - memcpy(disk_key, eb->kaddr + ptr - eb->map_start, - sizeof(*disk_key)); - return; - } else if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, KM_USER1); - eb->map_token = NULL; - } read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 51dcec86757..eb55863bb4a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -260,7 +260,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; + u64 num_bytes = 0; int ret; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -274,6 +274,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_rsv = NULL; goto got_it; } + + /* + * Do the reservation before we join the transaction so we can do all + * the appropriate flushing if need be. + */ + if (num_items > 0 && root != root->fs_info->chunk_root) { + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(NULL, root, + &root->fs_info->trans_block_rsv, + num_bytes); + if (ret) + return ERR_PTR(ret); + } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -310,24 +323,9 @@ again: goto again; } - if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items); - if (ret == -EAGAIN && !retries) { - retries++; - btrfs_commit_transaction(h, root); - goto again; - } else if (ret == -EAGAIN) { - /* - * We have already retried and got EAGAIN, so really we - * don't have space, so set ret to -ENOSPC. - */ - ret = -ENOSPC; - } - - if (ret < 0) { - btrfs_end_transaction(h, root); - return ERR_PTR(ret); - } + if (num_bytes) { + h->block_rsv = &root->fs_info->trans_block_rsv; + h->bytes_reserved = num_bytes; } got_it: @@ -499,10 +497,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } if (lock && cur_trans->blocked && !cur_trans->in_commit) { - if (throttle) + if (throttle) { + /* + * We may race with somebody else here so end up having + * to call end_transaction on ourselves again, so inc + * our use_count. + */ + trans->use_count++; return btrfs_commit_transaction(trans, root); - else + } else { wake_up_process(info->transaction_kthread); + } } WARN_ON(cur_trans != info->running_transaction); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ce8a9f41d1..ac278dd8317 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1730,8 +1730,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_read_buffer(next, ptr_gen); btrfs_tree_lock(next); - clean_tree_block(trans, root, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1796,8 +1796,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; btrfs_tree_lock(next); - clean_tree_block(trans, root, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1864,8 +1864,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; btrfs_tree_lock(next); - clean_tree_block(trans, log, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, log, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19450bc5363..b89e372c754 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3595,7 +3595,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(sb, 0); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5366fe452ab..d733b9cfea3 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -102,43 +102,57 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - /* first lets see if we already have this xattr */ - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - strlen(name), -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - - /* ok we already have this xattr, lets remove it */ - if (di) { - /* if we want create only exit */ - if (flags & XATTR_CREATE) { - ret = -EEXIST; + if (flags & XATTR_REPLACE) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, + name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + ret = -ENODATA; goto out; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); + if (ret) + goto out; btrfs_release_path(path); + } - /* if we don't have a value then we are removing the xattr */ - if (!value) +again: + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + if (ret == -EEXIST) { + if (flags & XATTR_CREATE) goto out; - } else { + /* + * We can't use the path we already have since we won't have the + * proper locking for a delete, so release the path and + * re-lookup to delete the thing. + */ btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + /* Shouldn't happen but just in case... */ + btrfs_release_path(path); + goto again; + } - if (flags & XATTR_REPLACE) { - /* we couldn't find the attr to replace */ - ret = -ENODATA; + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) goto out; + + /* + * We have a value to set, so go back and try to insert it now. + */ + if (value) { + btrfs_release_path(path); + goto again; } } - - /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - BUG_ON(ret); out: btrfs_free_path(path); return ret; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 0dba6915712..fb962efdace 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -102,7 +102,7 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry->d_parent->d_inode), + ceph_ino(req->r_old_dentry_dir), req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1065ac77984..382abc9a6a5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ - ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) - d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) - d_set_d_op(dentry, &ceph_snapdir_dentry_ops); - else - d_set_d_op(dentry, &ceph_snap_dentry_ops); - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -58,16 +50,42 @@ int ceph_init_dentry(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; } + + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + di->dentry = dentry; di->lease_session = NULL; - dentry->d_fsdata = di; dentry->d_time = jiffies; + /* avoid reordering d_fsdata setup so that the check above is safe */ + smp_mb(); + dentry->d_fsdata = di; ceph_dentry_lru_add(dentry); out_unlock: spin_unlock(&dentry->d_lock); return 0; } +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ + struct inode *inode = NULL; + + if (!dentry) + return NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent) { + inode = dentry->d_parent->d_inode; + ihold(inode); + } + spin_unlock(&dentry->d_lock); + return inode; +} /* @@ -133,7 +151,7 @@ more: d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); @@ -234,7 +252,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); - if (fi->at_end) + if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ @@ -403,7 +421,7 @@ more: dout("readdir next frag is %x\n", frag); goto more; } - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries @@ -435,7 +453,7 @@ static void reset_readdir(struct ceph_file_info *fi) dput(fi->dentry); fi->dentry = NULL; } - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) @@ -463,7 +481,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } retval = offset; @@ -488,21 +506,13 @@ out: } /* - * Process result of a lookup/open request. - * - * Mainly, make sure we return the final req->r_dentry (if it already - * existed) in place of the original VFS-provided dentry when they - * differ. - * - * Gracefully handle the case where the MDS replies with -ENOENT and - * no trace (which it may do, at its discretion, e.g., if it doesn't - * care to issue a lease on the negative dentry). + * Handle lookups for the hidden .snap directory. */ -struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, - struct dentry *dentry, int err) +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; + struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ /* .snap dir? */ if (err == -ENOENT && @@ -516,7 +526,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, d_add(dentry, inode); err = 0; } + return err; +} +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ if (err == -ENOENT) { /* no trace? */ err = 0; @@ -610,6 +636,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); @@ -789,6 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -887,6 +915,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -1002,36 +1031,38 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) */ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { + int valid = 0; struct inode *dir; if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; - dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, ceph_dentry(dentry)->offset); + dir = ceph_get_dentry_parent_inode(dentry); + /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - goto out_touch; + valid = 1; + } else if (dentry->d_inode && + ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + valid = 1; + } else if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) { + valid = 1; } - if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) - goto out_touch; - - if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) - goto out_touch; - dout("d_revalidate %p invalid\n", dentry); - d_drop(dentry); - return 0; -out_touch: - ceph_dentry_lru_touch(dentry); - return 1; + dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + if (valid) + ceph_dentry_lru_touch(dentry); + else + d_drop(dentry); + iput(dir); + return valid; } /* @@ -1228,9 +1259,8 @@ void ceph_dentry_lru_del(struct dentry *dn) * Return name hash for a given dentry. This is dependent on * the parent directory's hash function. */ -unsigned ceph_dentry_hash(struct dentry *dn) +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { - struct inode *dir = dn->d_parent->d_inode; struct ceph_inode_info *dci = ceph_inode(dir); switch (dci->i_dir_layout.dl_dir_hash) { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f67b687550d..9fbcdecaacc 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -46,7 +46,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; - struct dentry *parent = dentry->d_parent; + struct dentry *parent; struct inode *inode = dentry->d_inode; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; @@ -55,26 +55,33 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + spin_lock(&dentry->d_lock); + parent = dget(dentry->d_parent); + spin_unlock(&dentry->d_lock); + if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent); + cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, + dentry); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { if (connectable) { *max_len = connected_handle_length; - return 255; + type = 255; + } else { + dout("encode_fh %p\n", dentry); + fh->ino = ceph_ino(dentry->d_inode); + *max_len = handle_length; + type = 1; } - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(dentry->d_inode); - *max_len = handle_length; - type = 1; } else { *max_len = handle_length; - return 255; + type = 255; } + dput(parent); return type; } @@ -123,7 +130,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return dentry; } err = ceph_init_dentry(dentry); - if (err < 0) { iput(inode); return ERR_PTR(err); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0d0eae05598..ce549d31eeb 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -194,7 +194,10 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; + if (flags & (O_CREAT|O_TRUNC)) + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); @@ -222,9 +225,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file = nd->intent.open.file; - struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); + struct file *file; struct ceph_mds_request *req; + struct dentry *ret; int err; int flags = nd->intent.open.flags; @@ -242,16 +245,24 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - dentry = ceph_finish_lookup(req, dentry, err); - if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out; + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (!err) - err = ceph_init_file(req->r_dentry->d_inode, file, - req->r_fmode); + if (err) + goto out; + file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); + if (IS_ERR(file)) + err = PTR_ERR(file); +out: + ret = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", dentry); - return dentry; + dout("ceph_lookup_open result=%p\n", ret); + return ret; } int ceph_release(struct inode *inode, struct file *file) @@ -643,7 +654,8 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); else @@ -712,7 +724,7 @@ retry_snap: want = CEPH_CAP_FILE_BUFFER; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) - goto out; + goto out_put; dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, @@ -720,12 +732,23 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + /* + * buffered write; drop Fw early to avoid slow + * revocation if we get stuck on balance_dirty_pages + */ + int dirty; + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + ceph_put_cap_refs(ci, got); + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { @@ -733,7 +756,12 @@ retry_snap: if (err < 0) ret = err; } + + if (dirty) + __mark_inode_dirty(inode, dirty); + goto out; } + if (ret >= 0) { int dirty; spin_lock(&inode->i_lock); @@ -743,12 +771,13 @@ retry_snap: __mark_inode_dirty(inode, dirty); } -out: +out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); +out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index dfb2831d8d8..095799ba9dd 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -560,7 +560,8 @@ static int fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int i; - int issued, implemented; + int issued = 0, implemented; + int updating_inode = 0; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_buffer *xattr_blob = NULL; @@ -599,7 +600,8 @@ static int fill_inode(struct inode *inode, if (le64_to_cpu(info->version) > 0 && (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; - + + updating_inode = 1; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); @@ -707,17 +709,6 @@ static int fill_inode(struct inode *inode, ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ceph_decode_timespec(&ci->i_rctime, &info->rctime); - - /* set dir completion flag? */ - if (ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ - ci->i_max_offset = 2; - } break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", @@ -774,6 +765,19 @@ no_change: __ceph_get_fmode(ci, cap_fmode); } + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + updating_inode && /* didn't jump to no_change */ + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + dout(" marking %p complete (empty)\n", inode); + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ci->i_max_offset = 2; + } + /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); @@ -805,14 +809,14 @@ static void update_dentry_lease(struct dentry *dentry, return; spin_lock(&dentry->d_lock); - dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", - dentry, le16_to_cpu(lease->mask), duration, ttl); + dout("update_dentry_lease %p duration %lu ms ttl %lu\n", + dentry, duration, ttl); /* make lease_rdcache_gen match directory */ dir = dentry->d_parent->d_inode; di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; - if (lease->mask == 0) + if (duration == 0) goto out_unlock; if (di->lease_gen == session->s_cap_gen && @@ -839,11 +843,13 @@ out_unlock: /* * Set dentry's directory position based on the current dir's max, and * order it in d_subdirs, so that dcache_readdir behaves. + * + * Always called under directory's i_mutex. */ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; + struct inode *inode = dir->d_inode; struct ceph_dentry_info *di; BUG_ON(!inode); @@ -1022,9 +1028,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* do we have a dn lease? */ have_lease = have_dir_cap || - (le16_to_cpu(rinfo->dlease->mask) & - CEPH_LOCK_DN); - + le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) dout("fill_trace no dentry lease or dir cap\n"); @@ -1560,7 +1564,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1743,7 +1747,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index ef0b5f48e13..3b256b50f7d 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -38,7 +38,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; @@ -87,7 +87,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32(l.preferred_osd); + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } @@ -231,6 +233,14 @@ static long ceph_ioctl_lazyio(struct file *file) return 0; } +static long ceph_ioctl_syncio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + + fi->flags |= CEPH_F_SYNC; + return 0; +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -249,6 +259,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_LAZYIO: return ceph_ioctl_lazyio(file); + + case CEPH_IOC_SYNCIO: + return ceph_ioctl_syncio(file); } return -ENOTTY; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 52e8fd74d45..0c5167e4318 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -40,5 +40,6 @@ struct ceph_ioctl_dataloc { struct ceph_ioctl_dataloc) #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) #endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0c1d9175652..fee028b5332 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -483,22 +483,26 @@ void ceph_mdsc_release_request(struct kref *kref) destroy_reply_info(&req->r_reply_info); } if (req->r_inode) { - ceph_put_cap_refs(ceph_inode(req->r_inode), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_target_inode) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) { - ceph_put_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + /* + * track (and drop pins for) r_old_dentry_dir + * separately, since r_old_dentry's d_parent may have + * changed between the dir mutex being dropped and + * this request being freed. + */ + ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); dput(req->r_old_dentry); + iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); @@ -617,6 +621,12 @@ static void __unregister_request(struct ceph_mds_client *mdsc, */ struct dentry *get_nonsnap_parent(struct dentry *dentry) { + /* + * we don't need to worry about protecting the d_parent access + * here because we never renaming inside the snapped namespace + * except to resplice to another snapdir, and either the old or new + * result is a valid result. + */ while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) dentry = dentry->d_parent; return dentry; @@ -652,7 +662,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - struct inode *dir = req->r_dentry->d_parent->d_inode; + /* ignore race with rename; old or new d_parent is okay */ + struct dentry *parent = req->r_dentry->d_parent; + struct inode *dir = parent->d_inode; if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ @@ -660,8 +672,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = - get_nonsnap_parent(req->r_dentry->d_parent); + struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); } else if (req->r_dentry->d_inode) { @@ -670,7 +681,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else { /* dir + name */ inode = dir; - hash = ceph_dentry_hash(req->r_dentry); + hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; } } @@ -1931,9 +1942,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_old_dentry) - ceph_get_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); /* issue */ mutex_lock(&mdsc->mutex); @@ -2714,7 +2724,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_lease *h = msg->front.iov_base; u32 seq; struct ceph_vino vino; - int mask; struct qstr dname; int release = 0; @@ -2725,7 +2734,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, goto bad; vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - mask = le16_to_cpu(h->mask); seq = le32_to_cpu(h->seq); dname.name = (void *)h + sizeof(*h) + sizeof(u32); dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); @@ -2737,8 +2745,8 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", - ceph_lease_op_name(h->action), mask, vino.ino, inode, + dout("handle_lease %s, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); @@ -2828,7 +2836,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, return; lease = msg->front.iov_base; lease->action = action; - lease->mask = cpu_to_le16(1); lease->ino = cpu_to_le64(ceph_vino(inode).ino); lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); @@ -2850,7 +2857,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, * Pass @inode always, @dentry is optional. */ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry, int mask) + struct dentry *dentry) { struct ceph_dentry_info *di; struct ceph_mds_session *session; @@ -2858,7 +2865,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, BUG_ON(inode == NULL); BUG_ON(dentry == NULL); - BUG_ON(mask == 0); /* is dentry lease valid? */ spin_lock(&dentry->d_lock); @@ -2868,8 +2874,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, di->lease_gen != di->lease_session->s_cap_gen || !time_before(jiffies, dentry->d_time)) { dout("lease_release inode %p dentry %p -- " - "no lease on %d\n", - inode, dentry, mask); + "no lease\n", + inode, dentry); spin_unlock(&dentry->d_lock); return; } @@ -2880,8 +2886,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, __ceph_mdsc_drop_dentry_lease(dentry); spin_unlock(&dentry->d_lock); - dout("lease_release inode %p dentry %p mask %d to mds%d\n", - inode, dentry, mask, session->s_mds); + dout("lease_release inode %p dentry %p to mds%d\n", + inode, dentry, session->s_mds); ceph_mdsc_lease_send_msg(session, inode, dentry, CEPH_MDS_LEASE_RELEASE, seq); ceph_put_mds_session(session); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7d8a0d662d5..4bb239921db 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -171,6 +171,7 @@ struct ceph_mds_request { struct inode *r_inode; /* arg1 */ struct dentry *r_dentry; /* arg1 */ struct dentry *r_old_dentry; /* arg2: rename from or link from */ + struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; @@ -333,7 +334,7 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dn, int mask); + struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 54b14de2e72..e2643719133 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -449,6 +449,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + + /* + * If there is a write in progress, treat that as a dirty Fw, + * even though it hasn't completed yet; by the time we finish + * up this capsnap it will be. + */ + if (used & CEPH_CAP_FILE_WR) + dirty |= CEPH_CAP_FILE_WR; + if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -456,13 +465,19 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || - (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { + } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; - dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, - capsnap, snapc); + /* + * if we are a sync write, we may need to go to the snaprealm + * to get the current snapc. + */ + if (!snapc) + snapc = ci->i_snap_realm->cached_context; + + dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", + inode, capsnap, snapc, ceph_cap_string(dirty)); ihold(inode); atomic_set(&capsnap->nref, 1); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f2f77fd3c14..d47c5ec7fb1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -73,8 +73,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> - (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_files = le64_to_cpu(st.num_objects); @@ -780,6 +779,10 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; + else + fsc->backing_dev_info.ra_pages = + default_backing_dev_info.ra_pages; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", atomic_long_inc_return(&bdi_seq)); if (!err) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 30446b144e3..a23eed526f0 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -543,13 +543,16 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, /* * we keep buffered readdir results attached to file->private_data */ +#define CEPH_F_SYNC 1 +#define CEPH_F_ATEND 2 + struct ceph_file_info { - int fmode; /* initialized on open */ + short fmode; /* initialized on open */ + short flags; /* CEPH_F_* */ /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; - int at_end; /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ @@ -789,6 +792,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); @@ -796,7 +801,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); -extern unsigned ceph_dentry_hash(struct dentry *dn); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, @@ -819,14 +825,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, int p_locks, int f_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); -static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) -{ - if (dentry && dentry->d_parent) - return dentry->d_parent->d_inode; - - return NULL; -} - /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f42d730f1b6..96c6739a028 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5a0ee7f2af0..259991bd211 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -52,19 +52,29 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not init md5\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); + if (rc) { + cERROR(1, "%s: Could not update with payload\n", __func__); + return rc; + } rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); - return 0; + return rc; } /* must be called with server->srv_mutex held */ @@ -112,12 +122,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not init md5\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) @@ -131,14 +145,24 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base + 4, iov[i].iov_len - 4); - } else + } else { + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cERROR(1, "%s: Could not update with payload\n", + __func__); + return rc; + } } rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -463,8 +487,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); - crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NT Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -478,13 +506,18 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (user == NULL) { cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); rc = -ENOMEM; - goto calc_exit_2; + return rc; } len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); UniStrupr(user); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)user, 2 * len); + kfree(user); + if (rc) { + cERROR(1, "%s: Could not update with user\n", __func__); + return rc; + } /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -494,13 +527,19 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (domain == NULL) { cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)domain, 2 * len); kfree(domain); + if (rc) { + cERROR(1, "%s: Could not update with domain\n", + __func__); + return rc; + } } else if (ses->serverName) { len = strlen(ses->serverName); @@ -508,21 +547,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (server == NULL) { cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)server, 2 * len); kfree(server); + if (rc) { + cERROR(1, "%s: Could not update with server\n", + __func__); + return rc; + } } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ntlmv2_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); -calc_exit_1: - kfree(user); -calc_exit_2: return rc; } @@ -537,8 +581,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return -1; } - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -552,11 +600,17 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ses->auth_key.response + offset, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + offset, ses->auth_key.len - offset); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -626,8 +680,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -635,12 +693,18 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto setup_ntlmv2_rsp_ret; } - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); setup_ntlmv2_rsp_ret: kfree(tiblob); @@ -668,8 +732,12 @@ calc_seckey(struct cifs_ses *ses) desc.tfm = tfm_arc4; - crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); + if (rc) { + cERROR(1, "%s: Could not set response as a key", __func__); + return rc; + } sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); @@ -688,7 +756,7 @@ calc_seckey(struct cifs_ses *ses) crypto_free_blkcipher(tfm_arc4); - return 0; + return rc; } void diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6255fa812c7..1fcf4e5b311 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -501,7 +501,7 @@ struct cifs_search_info { char *ntwrk_buf_start; char *srch_entries_start; char *last_entry; - char *presume_name; + const char *presume_name; unsigned int resume_name_len; bool endOfSearch:1; bool emptyDir:1; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 14d602f178c..ae576fbb514 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -57,11 +57,6 @@ build_path_from_dentry(struct dentry *direntry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); unsigned seq; - if (direntry == NULL) - return NULL; /* not much we can do if dentry is freed and - we need to reopen the file after it was closed implicitly - when the server crashed */ - dirsep = CIFS_DIR_SEP(cifs_sb); if (tcon->Flags & SMB_SHARE_IS_IN_DFS) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); @@ -641,7 +636,7 @@ lookup_out: static int cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && (nd->flags & LOOKUP_RCU)) return -ECHILD; if (direntry->d_inode) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 556b1a0b54d..db3f18cdf02 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -74,8 +74,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) cERROR(1, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } - crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update iwth link_str\n", __func__); + goto symlink_hash_err; + } rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: crypto_free_shash(md5); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 965a3af186a..5de03ec2014 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -4,6 +4,7 @@ * Directory search handling * * Copyright (C) International Business Machines Corp., 2004, 2008 + * Copyright (C) Red Hat, Inc., 2011 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -290,10 +291,10 @@ error_exit: } /* return length of unicode string in bytes */ -static int cifs_unicode_bytelen(char *str) +static int cifs_unicode_bytelen(const char *str) { int len; - __le16 *ustr = (__le16 *)str; + const __le16 *ustr = (const __le16 *)str; for (len = 0; len <= PATH_MAX; len++) { if (ustr[len] == 0) @@ -334,78 +335,128 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) } +struct cifs_dirent { + const char *name; + size_t namelen; + u32 resume_key; + u64 ino; +}; + +static void cifs_fill_dirent_unix(struct cifs_dirent *de, + const FILE_UNIX_INFO *info, bool is_unicode) +{ + de->name = &info->FileName[0]; + if (is_unicode) + de->namelen = cifs_unicode_bytelen(de->name); + else + de->namelen = strnlen(de->name, PATH_MAX); + de->resume_key = info->ResumeKey; + de->ino = le64_to_cpu(info->basic.UniqueId); +} + +static void cifs_fill_dirent_dir(struct cifs_dirent *de, + const FILE_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_full(struct cifs_dirent *de, + const FILE_FULL_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_search(struct cifs_dirent *de, + const SEARCH_ID_FULL_DIR_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; + de->ino = le64_to_cpu(info->UniqueId); +} + +static void cifs_fill_dirent_both(struct cifs_dirent *de, + const FILE_BOTH_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_std(struct cifs_dirent *de, + const FIND_FILE_STANDARD_INFO *info) +{ + de->name = &info->FileName[0]; + /* one byte length, no endianess conversion */ + de->namelen = info->FileNameLength; + de->resume_key = info->ResumeKey; +} + +static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, + u16 level, bool is_unicode) +{ + memset(de, 0, sizeof(*de)); + + switch (level) { + case SMB_FIND_FILE_UNIX: + cifs_fill_dirent_unix(de, info, is_unicode); + break; + case SMB_FIND_FILE_DIRECTORY_INFO: + cifs_fill_dirent_dir(de, info); + break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + cifs_fill_dirent_full(de, info); + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fill_dirent_search(de, info); + break; + case SMB_FIND_FILE_BOTH_DIRECTORY_INFO: + cifs_fill_dirent_both(de, info); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_fill_dirent_std(de, info); + break; + default: + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + + return 0; +} + #define UNICODE_DOT cpu_to_le16(0x2e) /* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ -static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) +static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) { int rc = 0; - char *filename = NULL; - int len = 0; - - if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - filename = &pFindData->FileName[0]; - if (cfile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, 5); - } - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", - cfile->srch_inf.info_level); - } - if (filename) { - if (cfile->srch_inf.unicode) { - __le16 *ufilename = (__le16 *)filename; - if (len == 2) { - /* check for . */ - if (ufilename[0] == UNICODE_DOT) - rc = 1; - } else if (len == 4) { - /* check for .. */ - if ((ufilename[0] == UNICODE_DOT) - && (ufilename[1] == UNICODE_DOT)) - rc = 2; - } - } else /* ASCII */ { - if (len == 1) { - if (filename[0] == '.') - rc = 1; - } else if (len == 2) { - if ((filename[0] == '.') && (filename[1] == '.')) - rc = 2; - } + if (!de->name) + return 0; + + if (is_unicode) { + __le16 *ufilename = (__le16 *)de->name; + if (de->namelen == 2) { + /* check for . */ + if (ufilename[0] == UNICODE_DOT) + rc = 1; + } else if (de->namelen == 4) { + /* check for .. */ + if (ufilename[0] == UNICODE_DOT && + ufilename[1] == UNICODE_DOT) + rc = 2; + } + } else /* ASCII */ { + if (de->namelen == 1) { + if (de->name[0] == '.') + rc = 1; + } else if (de->namelen == 2) { + if (de->name[0] == '.' && de->name[1] == '.') + rc = 2; } } @@ -427,66 +478,18 @@ static int is_dir_changed(struct file *file) } static int cifs_save_resume_key(const char *current_entry, - struct cifsFileInfo *cifsFile) + struct cifsFileInfo *file_info) { - int rc = 0; - unsigned int len = 0; - __u16 level; - char *filename; - - if ((cifsFile == NULL) || (current_entry == NULL)) - return -EINVAL; - - level = cifsFile->srch_inf.info_level; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + struct cifs_dirent de; + int rc; - filename = &pFindData->FileName[0]; - if (cifsFile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; + rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (!rc) { + file_info->srch_inf.presume_name = de.name; + file_info->srch_inf.resume_name_len = de.namelen; + file_info->srch_inf.resume_key = de.resume_key; } - cifsFile->srch_inf.resume_name_len = len; - cifsFile->srch_inf.presume_name = filename; return rc; } @@ -605,136 +608,70 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, return rc; } -/* inode num, inode type and filename returned */ -static int cifs_get_name_from_search_buf(struct qstr *pqst, - char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum) +static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, + void *dirent, char *scratch_buf, unsigned int max_len) { + struct cifsFileInfo *file_info = file->private_data; + struct super_block *sb = file->f_path.dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_dirent de = { NULL, }; + struct cifs_fattr fattr; + struct dentry *dentry; + struct qstr name; int rc = 0; - unsigned int len = 0; - char *filename; - struct nls_table *nlt = cifs_sb->local_nls; - - *pinum = 0; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - - filename = &pFindData->FileName[0]; - if (unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } + ino_t ino; - *pinum = le64_to_cpu(pFindData->basic.UniqueId); - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - *pinum = le64_to_cpu(pFindData->UniqueId); - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; - } + rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (rc) + return rc; - if (len > max_len) { - cERROR(1, "bad search response length %d past smb end", len); + if (de.namelen > max_len) { + cERROR(1, "bad search response length %zd past smb end", + de.namelen); return -EINVAL; } - if (unicode) { - pqst->len = cifs_from_ucs2((char *) pqst->name, - (__le16 *) filename, - UNICODE_NAME_MAX, - min(len, max_len), nlt, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - pqst->len -= nls_nullsize(nlt); - } else { - pqst->name = filename; - pqst->len = len; - } - return rc; -} - -static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, - void *direntry, char *scratch_buf, unsigned int max_len) -{ - int rc = 0; - struct qstr qstring; - struct cifsFileInfo *pCifsF; - u64 inum; - ino_t ino; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; - struct dentry *tmp_dentry; - struct cifs_fattr fattr; - - /* get filename and len into qstring */ - /* get dentry */ - /* decide whether to create and populate ionde */ - if ((direntry == NULL) || (file == NULL)) - return -EINVAL; - - pCifsF = file->private_data; - - if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL)) - return -ENOENT; - - rc = cifs_entry_is_dot(pfindEntry, pCifsF); /* skip . and .. since we added them first */ - if (rc != 0) + if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode)) return 0; - sb = file->f_path.dentry->d_sb; - cifs_sb = CIFS_SB(sb); - - qstring.name = scratch_buf; - rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, - pCifsF->srch_inf.info_level, - pCifsF->srch_inf.unicode, cifs_sb, - max_len, &inum /* returned */); + if (file_info->srch_inf.unicode) { + struct nls_table *nlt = cifs_sb->local_nls; - if (rc) - return rc; + name.name = scratch_buf; + name.len = + cifs_from_ucs2((char *)name.name, (__le16 *)de.name, + UNICODE_NAME_MAX, + min(de.namelen, (size_t)max_len), nlt, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + name.len -= nls_nullsize(nlt); + } else { + name.name = de.name; + name.len = de.namelen; + } - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) + switch (file_info->srch_inf.info_level) { + case SMB_FIND_FILE_UNIX: cifs_unix_basic_to_fattr(&fattr, - &((FILE_UNIX_INFO *) pfindEntry)->basic, - cifs_sb); - else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) - cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) - pfindEntry, cifs_sb); - else - cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) - pfindEntry, cifs_sb); + &((FILE_UNIX_INFO *)find_entry)->basic, + cifs_sb); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_std_info_to_fattr(&fattr, + (FIND_FILE_STANDARD_INFO *)find_entry, + cifs_sb); + break; + default: + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)find_entry, + cifs_sb); + break; + } - if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { - fattr.cf_uniqueid = inum; + if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + fattr.cf_uniqueid = de.ino; } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); @@ -750,12 +687,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); + dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr); - rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, - ino, fattr.cf_dtype); + rc = filldir(dirent, name.name, name.len, file->f_pos, ino, + fattr.cf_dtype); - dput(tmp_dentry); + dput(dentry); return rc; } diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 1c5b770c314..42b9fff4875 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -157,8 +157,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) cERROR(1, "%s: Could not init md4 shash\n", __func__); goto mdfour_err; } - crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update with link_str\n", __func__); + goto mdfour_err; + } rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + if (rc) + cERROR(1, "%s: Could not genereate md4 hash\n", __func__); mdfour_err: crypto_free_shash(md4); diff --git a/fs/compat.c b/fs/compat.c index 0ea00832de2..0b48d018e38 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1675,256 +1675,10 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } #endif /* HAVE_SET_RESTORE_SIGMASK */ -#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED) -/* Stuff for NFS server syscalls... */ -struct compat_nfsctl_svc { - u16 svc32_port; - s32 svc32_nthreads; -}; - -struct compat_nfsctl_client { - s8 cl32_ident[NFSCLNT_IDMAX+1]; - s32 cl32_naddr; - struct in_addr cl32_addrlist[NFSCLNT_ADDRMAX]; - s32 cl32_fhkeytype; - s32 cl32_fhkeylen; - u8 cl32_fhkey[NFSCLNT_KEYMAX]; -}; - -struct compat_nfsctl_export { - char ex32_client[NFSCLNT_IDMAX+1]; - char ex32_path[NFS_MAXPATHLEN+1]; - compat_dev_t ex32_dev; - compat_ino_t ex32_ino; - compat_int_t ex32_flags; - __compat_uid_t ex32_anon_uid; - __compat_gid_t ex32_anon_gid; -}; - -struct compat_nfsctl_fdparm { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - compat_int_t gd32_version; -}; - -struct compat_nfsctl_fsparm { - struct sockaddr gd32_addr; - s8 gd32_path[NFS_MAXPATHLEN+1]; - compat_int_t gd32_maxlen; -}; - -struct compat_nfsctl_arg { - compat_int_t ca32_version; /* safeguard */ - union { - struct compat_nfsctl_svc u32_svc; - struct compat_nfsctl_client u32_client; - struct compat_nfsctl_export u32_export; - struct compat_nfsctl_fdparm u32_getfd; - struct compat_nfsctl_fsparm u32_getfs; - } u; -#define ca32_svc u.u32_svc -#define ca32_client u.u32_client -#define ca32_export u.u32_export -#define ca32_getfd u.u32_getfd -#define ca32_getfs u.u32_getfs -}; - -union compat_nfsctl_res { - __u8 cr32_getfh[NFS_FHSIZE]; - struct knfsd_fh cr32_getfs; -}; - -static int compat_nfs_svc_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) || - get_user(karg->ca_version, &arg->ca32_version) || - __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) || - __get_user(karg->ca_svc.svc_nthreads, - &arg->ca32_svc.svc32_nthreads)) - return -EFAULT; - return 0; -} - -static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_client, - sizeof(arg->ca32_client)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_client.cl_ident[0], - &arg->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX) || - __get_user(karg->ca_client.cl_naddr, - &arg->ca32_client.cl32_naddr) || - __copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) || - __get_user(karg->ca_client.cl_fhkeytype, - &arg->ca32_client.cl32_fhkeytype) || - __get_user(karg->ca_client.cl_fhkeylen, - &arg->ca32_client.cl32_fhkeylen) || - __copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX)) - return -EFAULT; - - return 0; -} - -static int compat_nfs_exp_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_export, - sizeof(arg->ca32_export)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_export.ex_client[0], - &arg->ca32_export.ex32_client[0], - NFSCLNT_IDMAX) || - __copy_from_user(&karg->ca_export.ex_path[0], - &arg->ca32_export.ex32_path[0], - NFS_MAXPATHLEN) || - __get_user(karg->ca_export.ex_dev, - &arg->ca32_export.ex32_dev) || - __get_user(karg->ca_export.ex_ino, - &arg->ca32_export.ex32_ino) || - __get_user(karg->ca_export.ex_flags, - &arg->ca32_export.ex32_flags) || - __get_user(karg->ca_export.ex_anon_uid, - &arg->ca32_export.ex32_anon_uid) || - __get_user(karg->ca_export.ex_anon_gid, - &arg->ca32_export.ex32_anon_gid)) - return -EFAULT; - SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); - SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid); - - return 0; -} - -static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ, &arg->ca32_getfd, - sizeof(arg->ca32_getfd)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_getfd.gd_addr, - &arg->ca32_getfd.gd32_addr, - (sizeof(struct sockaddr))) || - __copy_from_user(&karg->ca_getfd.gd_path, - &arg->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)) || - __get_user(karg->ca_getfd.gd_version, - &arg->ca32_getfd.gd32_version)) - return -EFAULT; - - return 0; -} - -static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, - struct compat_nfsctl_arg __user *arg) -{ - if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) || - get_user(karg->ca_version, &arg->ca32_version) || - __copy_from_user(&karg->ca_getfs.gd_addr, - &arg->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))) || - __copy_from_user(&karg->ca_getfs.gd_path, - &arg->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)) || - __get_user(karg->ca_getfs.gd_maxlen, - &arg->ca32_getfs.gd32_maxlen)) - return -EFAULT; - - return 0; -} - -/* This really doesn't need translations, we are only passing - * back a union which contains opaque nfs file handle data. - */ -static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, - union compat_nfsctl_res __user *res) -{ - int err; - - err = copy_to_user(res, kres, sizeof(*res)); - - return (err) ? -EFAULT : 0; -} - -asmlinkage long compat_sys_nfsservctl(int cmd, - struct compat_nfsctl_arg __user *arg, - union compat_nfsctl_res __user *res) -{ - struct nfsctl_arg *karg; - union nfsctl_res *kres; - mm_segment_t oldfs; - int err; - - karg = kmalloc(sizeof(*karg), GFP_USER); - kres = kmalloc(sizeof(*kres), GFP_USER); - if(!karg || !kres) { - err = -ENOMEM; - goto done; - } - - switch(cmd) { - case NFSCTL_SVC: - err = compat_nfs_svc_trans(karg, arg); - break; - - case NFSCTL_ADDCLIENT: - err = compat_nfs_clnt_trans(karg, arg); - break; - - case NFSCTL_DELCLIENT: - err = compat_nfs_clnt_trans(karg, arg); - break; - - case NFSCTL_EXPORT: - case NFSCTL_UNEXPORT: - err = compat_nfs_exp_trans(karg, arg); - break; - - case NFSCTL_GETFD: - err = compat_nfs_getfd_trans(karg, arg); - break; - - case NFSCTL_GETFS: - err = compat_nfs_getfs_trans(karg, arg); - break; - - default: - err = -EINVAL; - break; - } - - if (err) - goto done; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - /* The __user pointer casts are valid because of the set_fs() */ - err = sys_nfsservctl(cmd, (void __user *) karg, (void __user *) kres); - set_fs(oldfs); - - if (err) - goto done; - - if((cmd == NFSCTL_GETFD) || - (cmd == NFSCTL_GETFS)) - err = compat_nfs_getfh_res_trans(kres, res); - -done: - kfree(karg); - kfree(kres); - return err; -} -#else /* !NFSD */ long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) { return sys_ni_syscall(); } -#endif #ifdef CONFIG_EPOLL diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 61abb638b4b..8be086e9abe 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -68,6 +68,8 @@ #ifdef CONFIG_BLOCK #include <linux/loop.h> +#include <linux/cdrom.h> +#include <linux/fd.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> @@ -944,6 +946,9 @@ COMPATIBLE_IOCTL(FIOQSIZE) IGNORE_IOCTL(LOOP_CLR_FD) /* md calls this on random blockdevs */ IGNORE_IOCTL(RAID_VERSION) +/* qemu/qemu-img might call these two on plain files for probing */ +IGNORE_IOCTL(CDROM_DRIVE_STATUS) +IGNORE_IOCTL(FDGETPRM32) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) diff --git a/fs/dcache.c b/fs/dcache.c index be18598c7fd..b05aac3a8cf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2138,8 +2138,9 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller hold - * rename_lock. + * dcache entries should not be moved in this way. Caller must hold + * rename_lock, the i_mutex of the source and target directories, + * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry * dentry, struct dentry * target) { @@ -2202,7 +2203,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target) * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. + * dcache entries should not be moved in this way. See the locking + * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { @@ -2320,7 +2322,8 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) * @inode: inode to bind to the dentry, to which aliases may be attached * * Introduces an dentry into the tree, substituting an extant disconnected - * root directory alias in its place if there is one + * root directory alias in its place if there is one. Caller must hold the + * i_mutex of the parent directory. */ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 01d2d9ef609..44a360ca804 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -35,7 +35,7 @@ #include <linux/buffer_head.h> #include <linux/rwsem.h> #include <linux/uio.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * How many user pages to map in one call to get_user_pages(). This determines diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index e2b87800436..01fd5c11a7f 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -92,7 +92,7 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, op->info.number = number; op->info.start = 0; op->info.end = OFFSET_MAX; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) + if (fl->fl_lmops && fl->fl_lmops->lm_grant) op->info.owner = (__u64) fl->fl_pid; else op->info.owner = (__u64)(long) fl->fl_owner; @@ -128,11 +128,11 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { /* fl_owner is lockd which doesn't distinguish processes on the nfs client */ op->info.owner = (__u64) fl->fl_pid; - xop->callback = fl->fl_lmops->fl_grant; + xop->callback = fl->fl_lmops->lm_grant; locks_init_lock(&xop->flc); locks_copy_lock(&xop->flc, fl); xop->fl = fl; @@ -268,7 +268,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) + if (fl->fl_lmops && fl->fl_lmops->lm_grant) op->info.owner = (__u64) fl->fl_pid; else op->info.owner = (__u64)(long) fl->fl_owner; @@ -327,7 +327,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) + if (fl->fl_lmops && fl->fl_lmops->lm_grant) op->info.owner = (__u64) fl->fl_pid; else op->info.owner = (__u64)(long) fl->fl_owner; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 43c7c43b06f..b36c5572b3f 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -29,6 +29,7 @@ #define ECRYPTFS_KERNEL_H #include <keys/user-type.h> +#include <keys/encrypted-type.h> #include <linux/fs.h> #include <linux/fs_stack.h> #include <linux/namei.h> @@ -36,125 +37,18 @@ #include <linux/hash.h> #include <linux/nsproxy.h> #include <linux/backing-dev.h> +#include <linux/ecryptfs.h> -/* Version verification for shared data structures w/ userspace */ -#define ECRYPTFS_VERSION_MAJOR 0x00 -#define ECRYPTFS_VERSION_MINOR 0x04 -#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 -/* These flags indicate which features are supported by the kernel - * module; userspace tools such as the mount helper read - * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine - * how to behave. */ -#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 -#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 -#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 -#define ECRYPTFS_VERSIONING_POLICY 0x00000008 -#define ECRYPTFS_VERSIONING_XATTR 0x00000010 -#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 -#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 -#define ECRYPTFS_VERSIONING_HMAC 0x00000080 -#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 -#define ECRYPTFS_VERSIONING_GCM 0x00000200 -#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ - | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ - | ECRYPTFS_VERSIONING_PUBKEY \ - | ECRYPTFS_VERSIONING_XATTR \ - | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC \ - | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) -#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 -#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH -#define ECRYPTFS_SALT_SIZE 8 -#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) -/* The original signature size is only for what is stored on disk; all - * in-memory representations are expanded hex, so it better adapted to - * be passed around or referenced on the command line */ -#define ECRYPTFS_SIG_SIZE 8 -#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) -#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX -#define ECRYPTFS_MAX_KEY_BYTES 64 -#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 #define ECRYPTFS_DEFAULT_IV_BYTES 16 -#define ECRYPTFS_FILE_VERSION 0x03 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 #define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 #define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 #define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ #define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) -#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 #define ECRYPTFS_XATTR_NAME "user.ecryptfs" -#define RFC2440_CIPHER_DES3_EDE 0x02 -#define RFC2440_CIPHER_CAST_5 0x03 -#define RFC2440_CIPHER_BLOWFISH 0x04 -#define RFC2440_CIPHER_AES_128 0x07 -#define RFC2440_CIPHER_AES_192 0x08 -#define RFC2440_CIPHER_AES_256 0x09 -#define RFC2440_CIPHER_TWOFISH 0x0a -#define RFC2440_CIPHER_CAST_6 0x0b - -#define RFC2440_CIPHER_RSA 0x01 - -/** - * For convenience, we may need to pass around the encrypted session - * key between kernel and userspace because the authentication token - * may not be extractable. For example, the TPM may not release the - * private key, instead requiring the encrypted data and returning the - * decrypted data. - */ -struct ecryptfs_session_key { -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 -#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 -#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 - u32 flags; - u32 encrypted_key_size; - u32 decrypted_key_size; - u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; - u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; -}; - -struct ecryptfs_password { - u32 password_bytes; - s32 hash_algo; - u32 hash_iterations; - u32 session_key_encryption_key_bytes; -#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 -#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 - u32 flags; - /* Iterated-hash concatenation of salt and passphrase */ - u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - /* Always in expanded hex */ - u8 salt[ECRYPTFS_SALT_SIZE]; -}; - -enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; - -struct ecryptfs_private_key { - u32 key_size; - u32 data_len; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; - u8 data[]; -}; - -/* May be a password or a private key */ -struct ecryptfs_auth_tok { - u16 version; /* 8-bit major and 8-bit minor */ - u16 token_type; -#define ECRYPTFS_ENCRYPT_ONLY 0x00000001 - u32 flags; - struct ecryptfs_session_key session_key; - u8 reserved[32]; - union { - struct ecryptfs_password password; - struct ecryptfs_private_key private_key; - } token; -} __attribute__ ((packed)); - void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); @@ -185,11 +79,47 @@ struct ecryptfs_page_crypt_context { } param; }; +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + if (key->type == &key_type_encrypted) + return (struct ecryptfs_auth_tok *) + (&((struct encrypted_key_payload *)key->payload.data)->payload_data); + else + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return request_key(&key_type_encrypted, sig, NULL); +} + +#else +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return ERR_PTR(-ENOKEY); +} + +#endif /* CONFIG_ENCRYPTED_KEYS */ + static inline struct ecryptfs_auth_tok * ecryptfs_get_key_payload_data(struct key *key) { - return (struct ecryptfs_auth_tok *) - (((struct user_key_payload*)key->payload.data)->data); + struct ecryptfs_auth_tok *auth_tok; + + auth_tok = ecryptfs_get_encrypted_key_payload_data(key); + if (!auth_tok) + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload *)key->payload.data)->data); + else + return auth_tok; } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 340c657a108..11f8582d721 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -69,6 +69,7 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 27a7fefb83e..08a2b52bf56 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1635,11 +1635,14 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, (*auth_tok_key) = request_key(&key_type_user, sig, NULL); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { - printk(KERN_ERR "Could not find key with description: [%s]\n", - sig); - rc = process_request_key_err(PTR_ERR(*auth_tok_key)); - (*auth_tok_key) = NULL; - goto out; + (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + printk(KERN_ERR "Could not find key with description: [%s]\n", + sig); + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; + goto out; + } } down_write(&(*auth_tok_key)->sem); rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); @@ -1868,11 +1871,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, * just one will be sufficient to decrypt to get the FEK. */ find_next_matching_auth_tok: found_auth_tok = 0; - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - auth_tok_key = NULL; - } list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { candidate_auth_tok = &auth_tok_list_item->auth_tok; if (unlikely(ecryptfs_verbosity > 0)) { @@ -1909,14 +1907,22 @@ found_matching_auth_tok: memcpy(&(candidate_auth_tok->token.private_key), &(matching_auth_tok->token.private_key), sizeof(struct ecryptfs_private_key)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, crypt_stat); } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { memcpy(&(candidate_auth_tok->token.password), &(matching_auth_tok->token.password), sizeof(struct ecryptfs_password)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); rc = decrypt_passphrase_encrypted_session_key( candidate_auth_tok, crypt_stat); + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = -EINVAL; } if (rc) { struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; @@ -1956,15 +1962,12 @@ found_matching_auth_tok: out_wipe_list: wipe_auth_tok_list(&auth_tok_list); out: - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - } return rc; } static int -pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, +pki_encrypt_session_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_key_record *key_rec) { @@ -1979,6 +1982,8 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, crypt_stat->cipher, crypt_stat->key_size), crypt_stat, &payload, &payload_len); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); goto out; @@ -2008,6 +2013,8 @@ out: * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet * @dest: Buffer into which to write the packet * @remaining_bytes: Maximum number of bytes that can be writtn + * @auth_tok_key: The authentication token key to unlock and put when done with + * @auth_tok * @auth_tok: The authentication token used for generating the tag 1 packet * @crypt_stat: The cryptographic context * @key_rec: The key record struct for the tag 1 packet @@ -2018,7 +2025,7 @@ out: */ static int write_tag_1_packet(char *dest, size_t *remaining_bytes, - struct ecryptfs_auth_tok *auth_tok, + struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_key_record *key_rec, size_t *packet_size) { @@ -2039,12 +2046,15 @@ write_tag_1_packet(char *dest, size_t *remaining_bytes, memcpy(key_rec->enc_key, auth_tok->session_key.encrypted_key, auth_tok->session_key.encrypted_key_size); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); goto encrypted_session_key_set; } if (auth_tok->session_key.encrypted_key_size == 0) auth_tok->session_key.encrypted_key_size = auth_tok->token.private_key.key_size; - rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); + rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat, + key_rec); if (rc) { printk(KERN_ERR "Failed to encrypt session key via a key " "module; rc = [%d]\n", rc); @@ -2248,7 +2258,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, auth_tok->token.password.session_key_encryption_key, crypt_stat->key_size); ecryptfs_printk(KERN_DEBUG, - "Cached session key " "encryption key: \n"); + "Cached session key encryption key:\n"); if (ecryptfs_verbosity > 0) ecryptfs_dump_hex(session_key_encryption_key, 16); } @@ -2421,6 +2431,8 @@ ecryptfs_generate_key_packet_set(char *dest_base, &max, auth_tok, crypt_stat, key_rec, &written); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " "writing tag 3 packet\n"); @@ -2438,8 +2450,8 @@ ecryptfs_generate_key_packet_set(char *dest_base, } (*len) += written; } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { - rc = write_tag_1_packet(dest_base + (*len), - &max, auth_tok, + rc = write_tag_1_packet(dest_base + (*len), &max, + auth_tok_key, auth_tok, crypt_stat, key_rec, &written); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " @@ -2448,14 +2460,13 @@ ecryptfs_generate_key_packet_set(char *dest_base, } (*len) += written; } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); ecryptfs_printk(KERN_WARNING, "Unsupported " "authentication token type\n"); rc = -EINVAL; goto out_free; } - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - auth_tok_key = NULL; } if (likely(max > 0)) { dest_base[(*len)] = 0x00; @@ -2468,11 +2479,6 @@ out_free: out: if (rc) (*len) = 0; - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - } - mutex_unlock(&crypt_stat->keysig_list_mutex); return rc; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f9cfd168fbe..fe047d966dc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -37,7 +37,7 @@ #include <asm/system.h> #include <asm/io.h> #include <asm/mman.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * LOCKING: diff --git a/fs/exec.c b/fs/exec.c index 842d5700c15..da80612a35f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -181,14 +181,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) return; bprm->vma_pages = pages; - -#ifdef SPLIT_RSS_COUNTING - add_mm_counter(mm, MM_ANONPAGES, diff); -#else - spin_lock(&mm->page_table_lock); add_mm_counter(mm, MM_ANONPAGES, diff); - spin_unlock(&mm->page_table_lock); -#endif } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, @@ -277,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; @@ -1430,9 +1423,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } } read_unlock(&binfmt_lock); +#ifdef CONFIG_MODULES if (retval != -ENOEXEC || bprm->mm == NULL) { break; -#ifdef CONFIG_MODULES } else { #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) if (printable(bprm->buf[0]) && @@ -1440,9 +1433,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) printable(bprm->buf[2]) && printable(bprm->buf[3])) break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); -#endif } +#else + break; +#endif } return retval; } @@ -1649,15 +1646,26 @@ expand_fail: return ret; } +static void cn_escape(char *str) +{ + for (; *str; str++) + if (*str == '/') + *str = '!'; +} + static int cn_print_exe_file(struct core_name *cn) { struct file *exe_file; - char *pathbuf, *path, *p; + char *pathbuf, *path; int ret; exe_file = get_mm_exe_file(current->mm); - if (!exe_file) - return cn_printf(cn, "(unknown)"); + if (!exe_file) { + char *commstart = cn->corename + cn->used; + ret = cn_printf(cn, "%s (path unknown)", current->comm); + cn_escape(commstart); + return ret; + } pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { @@ -1671,9 +1679,7 @@ static int cn_print_exe_file(struct core_name *cn) goto free_buf; } - for (p = path; *p; p++) - if (*p == '/') - *p = '!'; + cn_escape(path); ret = cn_printf(cn, "%s", path); @@ -1745,16 +1751,22 @@ static int format_corename(struct core_name *cn, long signr) break; } /* hostname */ - case 'h': + case 'h': { + char *namestart = cn->corename + cn->used; down_read(&uts_sem); err = cn_printf(cn, "%s", utsname()->nodename); up_read(&uts_sem); + cn_escape(namestart); break; + } /* executable */ - case 'e': + case 'e': { + char *commstart = cn->corename + cn->used; err = cn_printf(cn, "%s", current->comm); + cn_escape(commstart); break; + } case 'E': err = cn_print_exe_file(cn); break; @@ -2118,16 +2130,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(&cn, signr); - if (ispipe == -ENOMEM) { - printk(KERN_WARNING "format_corename failed\n"); - printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; - } - if (ispipe) { int dump_count; char **helper_argv; + if (ispipe < 0) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_corename; + } + if (cprm.limit == 1) { /* * Normally core limits are irrelevant to pipes, since diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index bfe651f9ae1..52c05376394 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -128,7 +128,7 @@ fail: /* * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext2_get_acl(struct inode *inode, int type) { int name_index; @@ -231,29 +231,6 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int -ext2_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * @@ -276,29 +253,20 @@ ext2_init_acl(struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; - + mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto cleanup; - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext2_set_acl(inode, - ACL_TYPE_ACCESS, clone); - } + error = posix_acl_create(&acl, GFP_KERNEL, &mode); + if (error < 0) + return error; + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); @@ -322,7 +290,7 @@ cleanup: int ext2_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int error; if (!test_opt(inode->i_sb, POSIX_ACL)) @@ -332,14 +300,11 @@ ext2_acl_chmod(struct inode *inode) acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; + error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); return error; } diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 3ff6cbb9ac4..503bfb0ed79 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,12 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_check_acl (struct inode *, int); +extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> -#define ext2_check_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 82e06321de3..a5b3a5db312 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -102,6 +102,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index d60b7099e2d..761fde807fc 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -408,7 +408,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -419,5 +419,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .check_acl = ext2_check_acl, + .get_acl = ext2_get_acl, }; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 529970617a2..d27b71f1d18 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -161,6 +161,10 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; + name_len = strlen(name); + if (name_len > 255) + return -ERANGE; + down_read(&EXT2_I(inode)->xattr_sem); error = -ENODATA; if (!EXT2_I(inode)->i_file_acl) @@ -181,12 +185,8 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", error = -EIO; goto cleanup; } - /* find named attribute */ - name_len = strlen(name); - error = -ERANGE; - if (name_len > 255) - goto cleanup; + /* find named attribute */ entry = FIRST_ENTRY(bh); while (!IS_LAST_ENTRY(entry)) { struct ext2_xattr_entry *next = diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index edfeb293d4c..6c29bf0df04 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -131,7 +131,7 @@ fail: * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext3_get_acl(struct inode *inode, int type) { int name_index; @@ -239,29 +239,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -int -ext3_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * @@ -284,8 +261,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; + mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { error = ext3_set_acl(handle, inode, @@ -293,22 +269,15 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto cleanup; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, - ACL_TYPE_ACCESS, clone); - } + error = posix_acl_create(&acl, GFP_NOFS, &mode); + if (error < 0) + return error; + + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); @@ -332,7 +301,9 @@ cleanup: int ext3_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; + handle_t *handle; + int retries = 0; int error; if (S_ISLNK(inode->i_mode)) @@ -342,31 +313,24 @@ ext3_acl_chmod(struct inode *inode) acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - handle_t *handle; - int retries = 0; - - retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); - ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; +retry: + handle = ext3_journal_start(inode, + EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext3_std_error(inode->i_sb, error); + goto out; } + error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; out: - posix_acl_release(clone); + posix_acl_release(acl); return error; } diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 597334626de..dbc921e458c 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int); +extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_check_acl NULL +#define ext3_get_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index fe52297e31a..6386d76f44a 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -21,6 +21,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <trace/events/ext3.h> /* * balloc.c contains the blocks allocation and deallocation routines @@ -161,6 +162,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) desc = ext3_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + trace_ext3_read_block_bitmap(sb, block_group); bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -351,6 +353,7 @@ void ext3_rsv_window_add(struct super_block *sb, struct rb_node * parent = NULL; struct ext3_reserve_window_node *this; + trace_ext3_rsv_window_add(sb, rsv); while (*p) { parent = *p; @@ -476,8 +479,10 @@ void ext3_discard_reservation(struct inode *inode) rsv = &block_i->rsv_window_node; if (!rsv_is_empty(&rsv->rsv_window)) { spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); rsv_window_remove(inode->i_sb, rsv); + } spin_unlock(rsv_lock); } } @@ -683,14 +688,10 @@ error_return: void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { - struct super_block * sb; + struct super_block *sb = inode->i_sb; unsigned long dquot_freed_blocks; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + trace_ext3_free_blocks(inode, block, count); ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) dquot_free_block(inode, dquot_freed_blocks); @@ -1136,6 +1137,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, else start_block = grp_goal + group_first_block; + trace_ext3_alloc_new_reservation(sb, start_block); size = my_rsv->rsv_goal_size; if (!rsv_is_empty(&my_rsv->rsv_window)) { @@ -1230,8 +1232,11 @@ retry: * check if the first free block is within the * free space we just reserved */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); return 0; /* success */ + } /* * if the first free bit we found is out of the reservable space * continue search for next reservable space, @@ -1514,10 +1519,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext3_new_block: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1528,8 +1529,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, return 0; } + trace_ext3_request_blocks(inode, goal, num); + sbi = EXT3_SB(sb); - es = EXT3_SB(sb)->s_es; + es = sbi->s_es; ext3_debug("goal=%lu.\n", goal); /* * Allocate a block from reservation only when @@ -1742,6 +1745,10 @@ allocated: brelse(bitmap_bh); dquot_free_block(inode, *count-num); *count = num; + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + return ret_block; io_error: @@ -1996,6 +2003,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, if ((next - start) < minblocks) goto free_extent; + trace_ext3_discard_blocks(sb, discard_block, next - start); /* Send the TRIM command down to the device */ err = sb_issue_discard(sb, discard_block, next - start, GFP_NOFS, 0); @@ -2100,7 +2108,7 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) return -EINVAL; if (start >= max_blks) - goto out; + return -EINVAL; if (start + len > max_blks) len = max_blks - start; @@ -2148,8 +2156,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (ret >= 0) ret = 0; - -out: range->len = trimmed * sb->s_blocksize; return ret; diff --git a/fs/ext3/file.c b/fs/ext3/file.c index f55df0e61cb..724df69847d 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -71,7 +71,6 @@ const struct file_operations ext3_file_operations = { }; const struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, @@ -79,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 0bcf63adb80..d494c554c6e 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -30,6 +30,7 @@ #include <linux/jbd.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> +#include <trace/events/ext3.h> /* * akpm: A new design for ext3_sync_file(). @@ -51,12 +52,14 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret, needs_barrier = 0; tid_t commit_tid; + trace_ext3_sync_file_enter(file, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) return 0; ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) - return ret; + goto out; /* * Taking the mutex here just to keep consistent with how fsync was @@ -83,7 +86,8 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (ext3_should_journal_data(inode)) { mutex_unlock(&inode->i_mutex); - return ext3_force_commit(inode->i_sb); + ret = ext3_force_commit(inode->i_sb); + goto out; } if (datasync) @@ -104,6 +108,9 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + mutex_unlock(&inode->i_mutex); +out: + trace_ext3_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bfc2dc43681..bf09cbf938c 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -23,6 +23,7 @@ #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> +#include <trace/events/ext3.h> #include <asm/byteorder.h> @@ -118,6 +119,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) ino = inode->i_ino; ext3_debug ("freeing inode %lu\n", ino); + trace_ext3_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); @@ -426,6 +428,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -601,6 +604,7 @@ got: } ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); goto really_out; fail: ext3_std_error(sb, err); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2978a2a17a5..04da6acde85 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -38,10 +38,12 @@ #include <linux/bio.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <trace/events/ext3.h> #include "xattr.h" #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -70,6 +72,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -194,20 +197,47 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode) */ void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_block_alloc_info *rsv; handle_t *handle; int want_delete = 0; + trace_ext3_evict_inode(inode); if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); want_delete = 1; } + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); ext3_discard_reservation(inode); - rsv = EXT3_I(inode)->i_block_alloc_info; - EXT3_I(inode)->i_block_alloc_info = NULL; + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); @@ -231,15 +261,13 @@ void ext3_evict_inode (struct inode *inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -842,6 +870,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -886,6 +915,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -934,9 +966,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -970,6 +999,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -1202,6 +1234,16 @@ static void ext3_truncate_failed_write(struct inode *inode) ext3_truncate(inode); } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1217,6 +1259,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + trace_ext3_write_begin(inode, pos, len, flags); + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1332,6 +1376,7 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; + trace_ext3_ordered_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1412,7 @@ static int ext3_writeback_write_end(struct file *file, struct inode *inode = file->f_mapping->host; int ret; + trace_ext3_writeback_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); update_file_sizes(inode, pos, copied); /* @@ -1391,10 +1437,12 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1419,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file, if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_set_inode_state(inode, EXT3_STATE_JDATA); - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1577,6 +1626,7 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1647,6 +1697,7 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1689,6 +1740,7 @@ static int ext3_journalled_writepage(struct page *page, if (ext3_journal_current_handle()) goto no_write; + trace_ext3_journalled_writepage(page); handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1715,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page, if (ret == 0) ret = err; ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); } else { /* @@ -1739,6 +1793,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1753,6 +1808,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ @@ -1766,6 +1823,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1794,6 +1852,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_length(iov, nr_segs); int retries = 0; + trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (rw == WRITE) { loff_t final_size = offset + count; @@ -1827,7 +1887,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext3_truncate_failed_direct_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1841,7 +1901,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate(inode); + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); goto out; } @@ -1867,6 +1927,8 @@ retry: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); return ret; } @@ -1949,17 +2011,24 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2004,11 +2073,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -2022,6 +2103,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -2390,8 +2474,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, int ext3_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -2435,7 +2517,6 @@ void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2443,7 +2524,8 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; + + trace_ext3_truncate_enter(inode); if (!ext3_can_truncate(inode)) goto out_notrans; @@ -2451,37 +2533,12 @@ void ext3_truncate(struct inode *inode) if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - goto out_notrans; - } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) goto out_notrans; - } last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -2596,6 +2653,7 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); return; out_notrans: /* @@ -2604,6 +2662,7 @@ out_notrans: */ if (inode->i_nlink) ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2745,6 +2804,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); @@ -3229,18 +3289,36 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - rc = vmtruncate(inode, attr->ia_size); - if (rc) - goto err_out; + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); } setattr_copy(inode, attr); @@ -3374,6 +3452,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index f4090bd2f34..c7f43944f16 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -285,7 +285,7 @@ group_add_out: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -293,7 +293,7 @@ group_add_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index c095cf5640c..6e18a0b7750 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,6 +36,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> +#include <trace/events/ext3.h> #include "namei.h" #include "xattr.h" @@ -287,7 +288,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent while (len--) printk("%c", *name++); ext3fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); names++; @@ -1013,7 +1014,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk("%s not found\n", entry->name)); dx_release (frames); return NULL; } @@ -2140,6 +2141,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) struct ext3_dir_entry_2 * de; handle_t *handle; + trace_ext3_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2185,6 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext3_journal_stop(handle); brelse (bh); + trace_ext3_unlink_exit(dentry, retval); return retval; } @@ -2529,7 +2532,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2540,5 +2543,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext3_check_acl, + .get_acl = ext3_get_acl, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b57ea2f9126..7beb69ae001 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,9 @@ #include "acl.h" #include "namei.h" +#define CREATE_TRACE_POINTS +#include <trace/events/ext3.h> + #ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA #else @@ -497,6 +500,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + static void ext3_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -788,6 +799,7 @@ static const struct super_operations ext3_sops = { .destroy_inode = ext3_destroy_inode, .write_inode = ext3_write_inode, .dirty_inode = ext3_dirty_inode, + .drop_inode = ext3_drop_inode, .evict_inode = ext3_evict_inode, .put_super = ext3_put_super, .sync_fs = ext3_sync_fs, @@ -2509,6 +2521,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_ext3_sync_fs(sb, wait); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 32e6cc23bd9..d565759d82e 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -803,8 +803,16 @@ inserted: /* We need to allocate a new block */ ext3_fsblk_t goal = ext3_group_first_block_no(sb, EXT3_I(inode)->i_block_group); - ext3_fsblk_t block = ext3_new_block(handle, inode, - goal, &error); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 60d900fcc3d..dca2d1ded93 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -131,7 +131,7 @@ fail: * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext4_get_acl(struct inode *inode, int type) { int name_index; @@ -237,29 +237,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -int -ext4_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * @@ -282,8 +259,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; + mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { error = ext4_set_acl(handle, inode, @@ -291,22 +267,15 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto cleanup; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, - ACL_TYPE_ACCESS, clone); - } + error = posix_acl_create(&acl, GFP_NOFS, &mode); + if (error < 0) + return error; + + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); @@ -330,9 +299,12 @@ cleanup: int ext4_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; + handle_t *handle; + int retries = 0; int error; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; if (!test_opt(inode->i_sb, POSIX_ACL)) @@ -340,31 +312,24 @@ ext4_acl_chmod(struct inode *inode) acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - handle_t *handle; - int retries = 0; - - retry: - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); - ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; +retry: + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext4_std_error(inode->i_sb, error); + goto out; } + error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; out: - posix_acl_release(clone); + posix_acl_release(acl); return error; } diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9d843d5deac..18cb39ed7c7 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int); +struct posix_acl *ext4_get_acl(struct inode *inode, int type); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_check_acl NULL +#define ext4_get_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ce766f974b1..e4095e988eb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -301,7 +301,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 678cde834f1..3e5191f9f39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct address_space *mapping, index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; @@ -2973,7 +2973,7 @@ static int ext4_da_writepages(struct address_space *mapping, } retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); while (!ret && wbc->nr_to_write > 0) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 707d605bf76..8c9babac43d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2590,7 +2590,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; @@ -2602,5 +2602,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .check_acl = ext4_check_acl, + .get_acl = ext4_get_acl, }; diff --git a/fs/file_table.c b/fs/file_table.c index 01e4c1e8e6b..c322794f736 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -25,7 +25,7 @@ #include <linux/percpu.h> #include <linux/ima.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "internal.h" diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b8c507ca42f..04cf3b91e50 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -35,7 +35,9 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; @@ -180,12 +182,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); } - /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -195,11 +198,9 @@ void inode_wb_list_del(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode) +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -213,11 +214,9 @@ static void redirty_tail(struct inode *inode) /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ -static void requeue_io(struct inode *inode) +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } @@ -225,7 +224,7 @@ static void inode_sync_complete(struct inode *inode) { /* * Prevent speculative execution through - * spin_unlock(&inode_wb_list_lock); + * spin_unlock(&wb->list_lock); */ smp_mb(); @@ -250,15 +249,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) /* * Move expired dirty inodes from @delaying_queue to @dispatch_queue. */ -static void move_expired_inodes(struct list_head *delaying_queue, +static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + unsigned long *older_than_this) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; + int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); @@ -269,12 +269,13 @@ static void move_expired_inodes(struct list_head *delaying_queue, do_sb_sort = 1; sb = inode->i_sb; list_move(&inode->i_wb_list, &tmp); + moved++; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); - return; + goto out; } /* Move inodes from one superblock together */ @@ -286,6 +287,8 @@ static void move_expired_inodes(struct list_head *delaying_queue, list_move(&inode->i_wb_list, dispatch_queue); } } +out: + return moved; } /* @@ -301,9 +304,11 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - assert_spin_locked(&inode_wb_list_lock); + int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + trace_writeback_queue_io(wb, older_than_this, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -316,7 +321,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) /* * Wait for writeback on an inode to complete. */ -static void inode_wait_for_writeback(struct inode *inode) +static void inode_wait_for_writeback(struct inode *inode, + struct bdi_writeback *wb) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -324,15 +330,15 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * Write out an inode's dirty pages. Called under wb->list_lock and * inode->i_lock. Either the caller has an active reference on the inode or * the inode has I_WILL_FREE set. * @@ -343,13 +349,15 @@ static void inode_wait_for_writeback(struct inode *inode) * livelocks, etc. */ static int -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); if (!atomic_read(&inode->i_count)) @@ -367,14 +375,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { - requeue_io(inode); + requeue_io(inode, wb); + trace_writeback_single_inode_requeue(inode, wbc, + nr_to_write); return 0; } /* * It's a data-integrity sync. We must wait. */ - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); } BUG_ON(inode->i_state & I_SYNC); @@ -383,7 +393,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); ret = do_writepages(mapping, wbc); @@ -414,10 +424,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() @@ -428,7 +447,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * slice used up: queue for next turn */ - requeue_io(inode); + requeue_io(inode, wb); } else { /* * Writeback blocked by something other than @@ -437,7 +456,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode); + redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -446,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * submission or metadata updates after data IO * completion. */ - redirty_tail(inode); + redirty_tail(inode, wb); } else { /* * The inode is clean. At this point we either have @@ -457,9 +476,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } } inode_sync_complete(inode); + trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } +static long writeback_chunk_size(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else { + pages = min(bdi->avg_write_bandwidth / 2, + global_dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + pages = round_down(pages + MIN_WRITEBACK_PAGES, + MIN_WRITEBACK_PAGES); + } + + return pages; +} + /* * Write a portion of b_io inodes which belong to @sb. * @@ -467,24 +518,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * inodes. Otherwise write only ones which go sequentially * in reverse order. * - * Return 1, if the caller writeback routine should be - * interrupted. Otherwise return 0. + * Return the number of pages and/or inodes written. */ -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, - struct writeback_control *wbc, bool only_this_sb) +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + while (!list_empty(&wb->b_io)) { - long pages_skipped; struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { - if (only_this_sb) { + if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ - redirty_tail(inode); + redirty_tail(inode, wb); continue; } @@ -493,7 +556,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Bounce back to the caller to unpin this and * pin the next superblock. */ - return 0; + break; } /* @@ -504,95 +567,96 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode); + redirty_tail(inode, wb); continue; } - - /* - * Was this inode dirtied after sync_sb_inodes was called? - * This keeps sync from extra jobs and livelock. - */ - if (inode_dirtied_after(inode, wbc->wb_start)) { - spin_unlock(&inode->i_lock); - return 1; - } - __iget(inode); + write_chunk = writeback_chunk_size(wb->bdi, work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + writeback_single_inode(inode, wb, &wbc); - pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wbc); - if (wbc->pages_skipped != pages_skipped) { + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + if (!(inode->i_state & I_DIRTY)) + wrote++; + if (wbc.pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode); + redirty_tail(inode, wb); } spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); iput(inode); cond_resched(); - spin_lock(&inode_wb_list_lock); - if (wbc->nr_to_write <= 0) { - wbc->more_io = 1; - return 1; + spin_lock(&wb->list_lock); + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. + */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; } - if (!list_empty(&wb->b_more_io)) - wbc->more_io = 1; } - /* b_io is empty */ - return 1; + return wrote; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - int ret = 0; - - if (!wbc->wb_start) - wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); + unsigned long start_time = jiffies; + long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!grab_super_passive(sb)) { - requeue_io(inode); + /* + * grab_super_passive() may fail consistently due to + * s_umount being grabbed by someone else. Don't use + * requeue_io() to avoid busy retrying the inode/sb. + */ + redirty_tail(inode, wb); continue; } - ret = writeback_sb_inodes(sb, wb, wbc, false); + wrote += writeback_sb_inodes(sb, wb, work); drop_super(sb); - if (ret) - break; + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } - spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ + return wrote; } -static void __writeback_inodes_sb(struct super_block *sb, - struct bdi_writeback *wb, struct writeback_control *wbc) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + }; - spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_wb_list_lock); -} + spin_lock(&wb->list_lock); + if (list_empty(&wb->b_io)) + queue_io(wb, NULL); + __writeback_inodes_wb(wb, &work); + spin_unlock(&wb->list_lock); -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 + return nr_pages - work.nr_pages; +} static inline bool over_bground_thresh(void) { @@ -605,6 +669,16 @@ static inline bool over_bground_thresh(void) } /* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); +} + +/* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the @@ -622,47 +696,16 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - struct writeback_control wbc = { - .sync_mode = work->sync_mode, - .older_than_this = NULL, - .for_kupdate = work->for_kupdate, - .for_background = work->for_background, - .range_cyclic = work->range_cyclic, - }; + unsigned long wb_start = jiffies; + long nr_pages = work->nr_pages; unsigned long oldest_jif; - long wrote = 0; - long write_chunk; struct inode *inode; + long progress; - if (wbc.for_kupdate) { - wbc.older_than_this = &oldest_jif; - oldest_jif = jiffies - - msecs_to_jiffies(dirty_expire_interval * 10); - } - if (!wbc.range_cyclic) { - wbc.range_start = 0; - wbc.range_end = LLONG_MAX; - } - - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * __writeback_inodes_sb() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else - write_chunk = LONG_MAX; + oldest_jif = jiffies; + work->older_than_this = &oldest_jif; - wbc.wb_start = jiffies; /* livelock avoidance */ + spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -687,52 +730,54 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh()) break; - wbc.more_io = 0; - wbc.nr_to_write = write_chunk; - wbc.pages_skipped = 0; + if (work->for_kupdate) { + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + work->older_than_this = &oldest_jif; + } - trace_wbc_writeback_start(&wbc, wb->bdi); + trace_writeback_start(wb->bdi, work); + if (list_empty(&wb->b_io)) + queue_io(wb, work->older_than_this); if (work->sb) - __writeback_inodes_sb(work->sb, wb, &wbc); + progress = writeback_sb_inodes(work->sb, wb, work); else - writeback_inodes_wb(wb, &wbc); - trace_wbc_writeback_written(&wbc, wb->bdi); + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); - work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + wb_update_bandwidth(wb, wb_start); /* - * If we consumed everything, see if we have more + * Did we write something? Try for more + * + * Dirty inodes are moved to b_io for writeback in batches. + * The completion of the current batch does not necessarily + * mean the overall work is done. So we keep looping as long + * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write <= 0) + if (progress) continue; /* - * Didn't write everything and we don't have more IO, bail + * No more inodes for IO, bail */ - if (!wbc.more_io) + if (list_empty(&wb->b_more_io)) break; /* - * Did we write something? Try for more - */ - if (wbc.nr_to_write < write_chunk) - continue; - /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); - trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_wb_list_lock); } + spin_unlock(&wb->list_lock); - return wrote; + return nr_pages - work->nr_pages; } /* @@ -1063,10 +1108,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } spin_unlock(&inode->i_lock); - spin_lock(&inode_wb_list_lock); + spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); @@ -1162,10 +1207,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1267,6 +1313,7 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -1279,11 +1326,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, &wbc); + ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1303,13 +1350,14 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wbc); + ret = writeback_single_inode(inode, wb, wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7bb685cdd00..d480d9af46c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1507,7 +1507,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; int err; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { /* NLM needs asynchronous locks, which we don't support yet */ return -ENOLCK; } diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 70e90b4974c..d5e33a077a6 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -132,31 +132,17 @@ generic_acl_init(struct inode *inode, struct inode *dir) if (!S_ISLNK(inode->i_mode)) acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); if (acl) { - struct posix_acl *clone; - - if (S_ISDIR(inode->i_mode)) { - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto cleanup; - set_cached_acl(inode, ACL_TYPE_DEFAULT, clone); - posix_acl_release(clone); - } - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto cleanup; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, clone); - } - posix_acl_release(clone); + if (S_ISDIR(inode->i_mode)) + set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); + error = posix_acl_create(&acl, GFP_KERNEL, &mode); + if (error < 0) + return error; + inode->i_mode = mode; + if (error > 0) + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); } error = 0; -cleanup: posix_acl_release(acl); return error; } @@ -170,44 +156,22 @@ cleanup: int generic_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int error = 0; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - clone = posix_acl_clone(acl, GFP_KERNEL); + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) - set_cached_acl(inode, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); } return error; } -int -generic_check_acl(struct inode *inode, int mask) -{ - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - } else { - struct posix_acl *acl; - - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - } - return -EAGAIN; -} - const struct xattr_handler generic_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 8ef1079f166..884c9af0542 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -67,36 +67,9 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) return acl; } -/** - * gfs2_check_acl - Check an ACL to see if we're allowed to do something - * @inode: the file we want to do something to - * @mask: what we want to do - * - * Returns: errno - */ - -int gfs2_check_acl(struct inode *inode, int mask) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) { - struct posix_acl *acl; - int error; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; + return gfs2_acl_get(GFS2_I(inode), type); } static int gfs2_set_mode(struct inode *inode, mode_t mode) @@ -143,7 +116,7 @@ out: int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl, *clone; + struct posix_acl *acl; mode_t mode = inode->i_mode; int error = 0; @@ -168,16 +141,10 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) goto out; } - clone = posix_acl_clone(acl, GFP_NOFS); - error = -ENOMEM; - if (!clone) - goto out; - posix_acl_release(acl); - acl = clone; - - error = posix_acl_create_masq(acl, &mode); + error = posix_acl_create(&acl, GFP_NOFS, &mode); if (error < 0) - goto out; + return error; + if (error == 0) goto munge; @@ -193,7 +160,7 @@ out: int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; char *data; unsigned int len; int error; @@ -204,25 +171,19 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) if (!acl) return gfs2_setattr_simple(ip, attr); - clone = posix_acl_clone(acl, GFP_NOFS); + error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); + if (error) + return error; + + len = posix_acl_to_xattr(acl, NULL, 0); + data = kmalloc(len, GFP_NOFS); error = -ENOMEM; - if (!clone) + if (data == NULL) goto out; - posix_acl_release(acl); - acl = clone; - - error = posix_acl_chmod_masq(acl, attr->ia_mode); - if (!error) { - len = posix_acl_to_xattr(acl, NULL, 0); - data = kmalloc(len, GFP_NOFS); - error = -ENOMEM; - if (data == NULL) - goto out; - posix_acl_to_xattr(acl, data, len); - error = gfs2_xattr_acl_chmod(ip, attr, data); - kfree(data); - set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); - } + posix_acl_to_xattr(acl, data, len); + error = gfs2_xattr_acl_chmod(ip, attr, data); + kfree(data); + set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); out: posix_acl_release(acl); diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index b522b0cb39e..0da38dc7efe 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -16,7 +16,7 @@ #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES 25 -extern int gfs2_check_acl(struct inode *inode, int mask); +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); extern const struct xattr_handler gfs2_xattr_system_handler; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 0fb51a96eff..900cf986aad 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1846,7 +1846,7 @@ const struct inode_operations gfs2_file_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1867,7 +1867,7 @@ const struct inode_operations gfs2_dir_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; const struct inode_operations gfs2_symlink_iops = { @@ -1882,6 +1882,6 @@ const struct inode_operations gfs2_symlink_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .check_acl = gfs2_check_acl, + .get_acl = gfs2_get_acl, }; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 29e1ace7953..8a139ff1919 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -16,7 +16,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 516516e0c2a..3bc073a4cf8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1018,13 +1018,13 @@ hostdata_error: fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, fsname); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return ret; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7aafeb8fa30..87b6e0421c1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -94,7 +94,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; - if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); @@ -1030,6 +1030,7 @@ static int __init init_hugetlbfs_fs(void) static void __exit exit_hugetlbfs_fs(void) { kmem_cache_destroy(hugetlbfs_inode_cachep); + kern_unmount(hugetlbfs_vfsmount); unregister_filesystem(&hugetlbfs_fs_type); bdi_destroy(&hugetlbfs_backing_dev_info); } diff --git a/fs/inode.c b/fs/inode.c index 96c77b81167..d0c72ff6b30 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -37,7 +37,7 @@ * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list - * inode_wb_list_lock protects: + * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash @@ -48,7 +48,7 @@ * inode->i_lock * inode->i_sb->s_inode_lru_lock * - * inode_wb_list_lock + * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock @@ -65,7 +65,6 @@ static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * Empty aops. Can be used for the cases where the user does not @@ -362,9 +361,11 @@ EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - spin_lock(&inode_sb_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + if (!list_empty(&inode->i_sb_list)) { + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); + } } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -797,6 +798,29 @@ unsigned int get_next_ino(void) EXPORT_SYMBOL(get_next_ino); /** + * new_inode_pseudo - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *new_inode_pseudo(struct super_block *sb) +{ + struct inode *inode = alloc_inode(sb); + + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + INIT_LIST_HEAD(&inode->i_sb_list); + } + return inode; +} + +/** * new_inode - obtain an inode * @sb: superblock * @@ -814,13 +838,9 @@ struct inode *new_inode(struct super_block *sb) spin_lock_prefetch(&inode_sb_list_lock); - inode = alloc_inode(sb); - if (inode) { - spin_lock(&inode->i_lock); - inode->i_state = 0; - spin_unlock(&inode->i_lock); + inode = new_inode_pseudo(sb); + if (inode) inode_sb_list_add(inode); - } return inode; } EXPORT_SYMBOL(new_inode); diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e4b87bc1fa5..f94fc48ff3a 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,8 @@ #include <linux/jbd.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Unlink a buffer from a transaction checkpoint list. @@ -95,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -220,8 +226,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -240,7 +246,6 @@ restart: */ released = __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } @@ -253,9 +258,12 @@ static void __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(bhs[i], WRITE); + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; @@ -304,12 +312,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -358,6 +366,7 @@ int log_do_checkpoint(journal_t *journal) * journal straight away. */ result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -503,6 +512,7 @@ int cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %u), " "freeing %u\n", @@ -523,9 +533,9 @@ int cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and release + * them. * - * Called with the journal locked. * Called with j_list_lock held. * Returns number of bufers reaped (for debug) */ @@ -632,8 +642,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -652,13 +662,14 @@ int __journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -669,10 +680,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -684,7 +693,6 @@ int __journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -703,6 +711,8 @@ void __journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { @@ -752,6 +762,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd_drop_transaction(journal, transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); kfree(transaction); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 72ffa974b0b..8799207df05 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -204,6 +205,8 @@ write_out_data: if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); /* Write out all data to prevent deadlocks */ journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; @@ -236,6 +239,8 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; @@ -253,10 +258,6 @@ write_out_data: jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); - journal_remove_journal_head(bh); - /* One for our safety reference, other for - * journal_remove_journal_head() */ - put_bh(bh); release_data_buffer(bh); } @@ -266,6 +267,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); return err; @@ -316,12 +318,14 @@ void journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -392,6 +396,7 @@ void journal_commit_transaction(journal_t *journal) */ journal_switch_revoke_table(journal); + trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -446,14 +451,9 @@ void journal_commit_transaction(journal_t *journal) } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) { + jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } + jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } @@ -493,6 +493,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -797,10 +798,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); @@ -858,28 +865,27 @@ restart_loop: __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); @@ -946,6 +952,7 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e2d4285fbe9..9fe061fb877 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -38,6 +38,9 @@ #include <linux/debugfs.h> #include <linux/ratelimit.h> +#define CREATE_TRACE_POINTS +#include <trace/events/jbd.h> + #include <asm/uaccess.h> #include <asm/page.h> @@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait) } else write_dirty_buffer(bh, WRITE); + trace_jbd_update_superblock_end(journal, wait); out: /* If we have just flushed the log (by marking s_start==0), then * any future commit will have to be careful to update the @@ -1799,10 +1803,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -1815,17 +1818,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = journal_add_journal_head(bh); * ... - * jh->b_transaction = xxx; - * journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *journal_add_journal_head(struct buffer_head *bh) @@ -1889,61 +1891,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void journal_put_journal_head(struct journal_head *jh) @@ -1953,11 +1923,12 @@ void journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index f7ee81a065d..7e59c6e66f9 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> +#include <linux/backing-dev.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); if (!new_transaction) { - ret = -ENOMEM; - goto out; + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; } } @@ -696,7 +696,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; /* first access by this transaction */ jh->b_modified = 0; @@ -844,8 +842,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } @@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) ret = -EIO; goto no_journal; } - - if (jh->b_transaction != NULL) { + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } @@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { @@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 3675b3cdee8..27c511a1cf0 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -156,7 +156,7 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) return ERR_PTR(-EINVAL); } -static struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +struct posix_acl *jffs2_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *value = NULL; @@ -259,30 +259,11 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) { struct posix_acl *acl; int rc; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - rc = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return rc; - } - return -EAGAIN; -} - -int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) -{ - struct posix_acl *acl, *clone; - int rc; - cache_no_acl(inode); if (S_ISLNK(*i_mode)) @@ -298,18 +279,13 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) if (S_ISDIR(*i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - if (!clone) - return -ENOMEM; - rc = posix_acl_create_masq(clone, (mode_t *)i_mode); - if (rc < 0) { - posix_acl_release(clone); + rc = posix_acl_create(&acl, GFP_KERNEL, i_mode); + if (rc < 0) return rc; - } if (rc > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(clone); + posix_acl_release(acl); } return 0; } @@ -335,7 +311,7 @@ int jffs2_init_acl_post(struct inode *inode) int jffs2_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int rc; if (S_ISLNK(inode->i_mode)) @@ -343,14 +319,11 @@ int jffs2_acl_chmod(struct inode *inode) acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (rc) + return rc; + rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - rc = posix_acl_chmod_masq(clone, inode->i_mode); - if (!rc) - rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone); - posix_acl_release(clone); return rc; } diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 5e42de8d954..b3421c78d9f 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,9 +26,9 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_check_acl(struct inode *, int); +struct posix_acl *jffs2_get_acl(struct inode *inode, int type); extern int jffs2_acl_chmod(struct inode *); -extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); extern int jffs2_init_acl_post(struct inode *); extern const struct xattr_handler jffs2_acl_access_xattr_handler; @@ -36,7 +36,7 @@ extern const struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_check_acl (NULL) +#define jffs2_get_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 5f243cd63af..9659b7c0046 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -56,7 +56,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 3989f7e09f7..61e6723535b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -63,7 +63,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 46ad619b612..b81b35ddf4e 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -80,7 +80,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) { jffs2_free_raw_inode(ri); - if (S_ISLNK(inode->i_mode & S_IFMT)) + if (S_ISLNK(inode->i_mode)) kfree(mdata); return ret; } @@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, fill in the raw_inode while you're at it. */ -struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) +struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri) { struct inode *inode; struct super_block *sb = dir_i->i_sb; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 9c252835e8e..526979c607b 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -173,7 +173,7 @@ int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); void jffs2_dirty_inode(struct inode *inode, int flags); -struct inode *jffs2_new_inode (struct inode *dir_i, int mode, +struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); int jffs2_remount_fs (struct super_block *, int *, char *); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 2ab1a0d9121..ee57bac1ba6 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1041,7 +1041,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf /* FIXME: point() */ err = jffs2_flash_read(c, ref_offset(ref), len, &retlen, buf); if (err) { - JFFS2_ERROR("can not read %d bytes from 0x%08x, " "error code: %d.\n", len, ref_offset(ref), err); + JFFS2_ERROR("can not read %d bytes from 0x%08x, error code: %d.\n", len, ref_offset(ref), err); goto free_out; } diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b955626071c..e3035afb181 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -20,7 +20,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .check_acl = jffs2_check_acl, + .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 8a0a0666d5a..b3a32caf2b4 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -27,7 +27,7 @@ #include "jfs_xattr.h" #include "jfs_acl.h" -static struct posix_acl *jfs_get_acl(struct inode *inode, int type) +struct posix_acl *jfs_get_acl(struct inode *inode, int type) { struct posix_acl *acl; char *ea_name; @@ -114,30 +114,9 @@ out: return rc; } -int jfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - - return -EAGAIN; -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; - struct posix_acl *clone; - mode_t mode; int rc = 0; if (S_ISLNK(inode->i_mode)) @@ -148,25 +127,18 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) return PTR_ERR(acl); if (acl) { + mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); if (rc) goto cleanup; } - clone = posix_acl_clone(acl, GFP_KERNEL); - if (!clone) { - rc = -ENOMEM; - goto cleanup; - } - mode = inode->i_mode; - rc = posix_acl_create_masq(clone, &mode); - if (rc >= 0) { - inode->i_mode = mode; - if (rc > 0) - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, - clone); - } - posix_acl_release(clone); + rc = posix_acl_create(&acl, GFP_KERNEL, &mode); + if (rc < 0) + goto cleanup; /* posix_acl_release(NULL) is no-op */ + inode->i_mode = mode; + if (rc > 0) + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); cleanup: posix_acl_release(acl); } else @@ -180,8 +152,9 @@ cleanup: int jfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int rc; + tid_t tid; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -190,22 +163,18 @@ int jfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - - rc = posix_acl_chmod_masq(clone, inode->i_mode); - if (!rc) { - tid_t tid = txBegin(inode->i_sb, 0); - mutex_lock(&JFS_IP(inode)->commit_mutex); - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, clone); - if (!rc) - rc = txCommit(tid, 1, &inode, 0); - txEnd(tid); - mutex_unlock(&JFS_IP(inode)->commit_mutex); - } + rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (rc) + return rc; - posix_acl_release(clone); + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + + posix_acl_release(acl); return rc; } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7527855b5cc..844f9460cb1 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -140,7 +140,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL - .check_acl = jfs_check_acl, + .get_acl = jfs_get_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 54e07559878..ad84fe50ca9 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_check_acl(struct inode *, int); +struct posix_acl *jfs_get_acl(struct inode *inode, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_acl_chmod(struct inode *inode); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 4496872cf4e..9cbd11a3f80 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -3161,7 +3161,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, { int rc; int dbitno, word, rembits, nb, nwords, wbitno, agno; - s8 oldroot, *leaf; + s8 oldroot; struct dmaptree *tp = (struct dmaptree *) & dp->tree; /* save the current value of the root (i.e. maximum free string) @@ -3169,9 +3169,6 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, */ oldroot = tp->stree[ROOT]; - /* pick up a pointer to the leaves of the dmap tree */ - leaf = tp->stree + LEAFIND; - /* determine the bit number and word within the dmap of the * starting block. */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f6cc0c09ec6..af9606057dd 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1143,7 +1143,6 @@ int txCommit(tid_t tid, /* transaction identifier */ struct jfs_log *log; struct tblock *tblk; struct lrd *lrd; - int lsn; struct inode *ip; struct jfs_inode_info *jfs_ip; int k, n; @@ -1310,7 +1309,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; - lsn = lmLog(log, tblk, lrd, NULL); + lmLog(log, tblk, lrd, NULL); lmGroupCommit(log, tblk); @@ -2935,7 +2934,6 @@ int jfs_sync(void *arg) { struct inode *ip; struct jfs_inode_info *jfs_ip; - int rc; tid_t tid; do { @@ -2961,7 +2959,7 @@ int jfs_sync(void *arg) */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE); - rc = txCommit(tid, 1, &ip, 0); + txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 03787ef6a11..e17545e1566 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -893,7 +893,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unchar *i_fastsymlink; s64 xlen = 0; int bmask = 0, xsize; - s64 extent = 0, xaddr; + s64 xaddr; struct metapage *mp; struct super_block *sb; struct tblock *tblk; @@ -993,7 +993,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, txAbort(tid, 0); goto out3; } - extent = xaddr; ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ @@ -1537,7 +1536,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL - .check_acl = jfs_check_acl, + .get_acl = jfs_get_acl, #endif }; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 6e31695d046..f0179c3745d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -632,7 +632,7 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) /* * This is a callback from the filesystem for VFS file lock requests. - * It will be used if fl_grant is defined and the filesystem can not + * It will be used if lm_grant is defined and the filesystem can not * respond to the request immediately. * For GETLK request it will copy the reply to the nlm_block. * For SETLK or SETLKW request it will get the local posix lock. @@ -719,9 +719,9 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) } const struct lock_manager_operations nlmsvc_lock_operations = { - .fl_compare_owner = nlmsvc_same_owner, - .fl_notify = nlmsvc_notify_blocked, - .fl_grant = nlmsvc_grant_deferred, + .lm_compare_owner = nlmsvc_same_owner, + .lm_notify = nlmsvc_notify_blocked, + .lm_grant = nlmsvc_grant_deferred, }; /* diff --git a/fs/locks.c b/fs/locks.c index b286539d547..703f545097d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -160,26 +160,20 @@ EXPORT_SYMBOL_GPL(unlock_flocks); static struct kmem_cache *filelock_cache __read_mostly; -static void locks_init_lock_always(struct file_lock *fl) +static void locks_init_lock_heads(struct file_lock *fl) { - fl->fl_next = NULL; - fl->fl_fasync = NULL; - fl->fl_owner = NULL; - fl->fl_pid = 0; - fl->fl_nspid = NULL; - fl->fl_file = NULL; - fl->fl_flags = 0; - fl->fl_type = 0; - fl->fl_start = fl->fl_end = 0; + INIT_LIST_HEAD(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_block); + init_waitqueue_head(&fl->fl_wait); } /* Allocate an empty lock structure. */ struct file_lock *locks_alloc_lock(void) { - struct file_lock *fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); if (fl) - locks_init_lock_always(fl); + locks_init_lock_heads(fl); return fl; } @@ -193,8 +187,8 @@ void locks_release_private(struct file_lock *fl) fl->fl_ops = NULL; } if (fl->fl_lmops) { - if (fl->fl_lmops->fl_release_private) - fl->fl_lmops->fl_release_private(fl); + if (fl->fl_lmops->lm_release_private) + fl->fl_lmops->lm_release_private(fl); fl->fl_lmops = NULL; } @@ -215,27 +209,12 @@ EXPORT_SYMBOL(locks_free_lock); void locks_init_lock(struct file_lock *fl) { - INIT_LIST_HEAD(&fl->fl_link); - INIT_LIST_HEAD(&fl->fl_block); - init_waitqueue_head(&fl->fl_wait); - fl->fl_ops = NULL; - fl->fl_lmops = NULL; - locks_init_lock_always(fl); + memset(fl, 0, sizeof(struct file_lock)); + locks_init_lock_heads(fl); } EXPORT_SYMBOL(locks_init_lock); -/* - * Initialises the fields of the file lock which are invariant for - * free file_locks. - */ -static void init_once(void *foo) -{ - struct file_lock *lock = (struct file_lock *) foo; - - locks_init_lock(lock); -} - static void locks_copy_private(struct file_lock *new, struct file_lock *fl) { if (fl->fl_ops) { @@ -444,9 +423,9 @@ static void lease_release_private_callback(struct file_lock *fl) } static const struct lock_manager_operations lease_manager_ops = { - .fl_break = lease_break_callback, - .fl_release_private = lease_release_private_callback, - .fl_change = lease_modify, + .lm_break = lease_break_callback, + .lm_release_private = lease_release_private_callback, + .lm_change = lease_modify, }; /* @@ -499,9 +478,9 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) */ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) { - if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) + if (fl1->fl_lmops && fl1->fl_lmops->lm_compare_owner) return fl2->fl_lmops == fl1->fl_lmops && - fl1->fl_lmops->fl_compare_owner(fl1, fl2); + fl1->fl_lmops->lm_compare_owner(fl1, fl2); return fl1->fl_owner == fl2->fl_owner; } @@ -551,8 +530,8 @@ static void locks_wake_up_blocks(struct file_lock *blocker) waiter = list_first_entry(&blocker->fl_block, struct file_lock, fl_block); __locks_delete_block(waiter); - if (waiter->fl_lmops && waiter->fl_lmops->fl_notify) - waiter->fl_lmops->fl_notify(waiter); + if (waiter->fl_lmops && waiter->fl_lmops->lm_notify) + waiter->fl_lmops->lm_notify(waiter); else wake_up(&waiter->fl_wait); } @@ -1239,7 +1218,7 @@ int __break_lease(struct inode *inode, unsigned int mode) fl->fl_type = future; fl->fl_break_time = break_time; /* lease must have lmops break callback */ - fl->fl_lmops->fl_break(fl); + fl->fl_lmops->lm_break(fl); } } @@ -1349,7 +1328,7 @@ int fcntl_getlease(struct file *filp) * @arg: type of lease to obtain * @flp: input - file_lock to use, output - file_lock inserted * - * The (input) flp->fl_lmops->fl_break function is required + * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). * * Called with file_lock_lock held. @@ -1375,7 +1354,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) time_out_leases(inode); - BUG_ON(!(*flp)->fl_lmops->fl_break); + BUG_ON(!(*flp)->fl_lmops->lm_break); if (arg != F_UNLCK) { error = -EAGAIN; @@ -1417,7 +1396,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; if (my_before != NULL) { - error = lease->fl_lmops->fl_change(my_before, arg); + error = lease->fl_lmops->lm_change(my_before, arg); if (!error) *flp = *my_before; goto out; @@ -1453,7 +1432,7 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) * @lease: file_lock to use * * Call this to establish a lease on the file. - * The (*lease)->fl_lmops->fl_break operation must be set; if not, + * The (*lease)->fl_lmops->lm_break operation must be set; if not, * break_lease will oops! * * This will call the filesystem's setlease file method, if @@ -1751,10 +1730,10 @@ out: * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * fl_grant is set. Callers expecting ->lock() to return asynchronously + * lm_grant is set. Callers expecting ->lock() to return asynchronously * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) * the request is for a blocking lock. When ->lock() does return asynchronously, - * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock + * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock * request completes. * If the request is for non-blocking lock the file system should return * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine @@ -1764,7 +1743,7 @@ out: * grants a lock so the VFS can find out which locks are locally held and do * the correct lock cleanup when required. * The underlying filesystem must not drop the kernel lock or call - * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED + * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED * return code. */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) @@ -2333,8 +2312,8 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, - init_once); + sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + return 0; } diff --git a/fs/namei.c b/fs/namei.c index b7fad009bbf..f8c69d37379 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> +#include <linux/posix_acl.h> #include <asm/uaccess.h> #include "internal.h" @@ -173,12 +174,62 @@ void putname(const char *name) EXPORT_SYMBOL(putname); #endif +static int check_acl(struct inode *inode, int mask) +{ +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *acl; + + /* + * Under RCU walk, we cannot even do a "get_cached_acl()", + * because that involves locking and getting a refcount on + * a cached ACL. + * + * So the only case we handle during RCU walking is the + * case of a cached "no ACL at all", which needs no locks + * or refcounts. + */ + if (mask & MAY_NOT_BLOCK) { + if (negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -EAGAIN; + return -ECHILD; + } + + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (acl == ACL_NOT_CACHED) { + if (inode->i_op->get_acl) { + acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } else { + set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); + return -EAGAIN; + } + } + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } +#endif + + return -EAGAIN; +} + /* * This does basic POSIX ACL permission checking */ static int acl_permission_check(struct inode *inode, int mask) { - int (*check_acl)(struct inode *inode, int mask); unsigned int mode = inode->i_mode; mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; @@ -186,11 +237,10 @@ static int acl_permission_check(struct inode *inode, int mask) if (current_user_ns() != inode_userns(inode)) goto other_perms; - if (current_fsuid() == inode->i_uid) + if (likely(current_fsuid() == inode->i_uid)) mode >>= 6; else { - check_acl = inode->i_op->check_acl; - if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { + if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(inode, mask); if (error != -EAGAIN) return error; diff --git a/fs/namespace.c b/fs/namespace.c index cda50fe9250..22bfe8273c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2721,6 +2721,25 @@ EXPORT_SYMBOL(put_mnt_ns); struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) { - return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + struct vfsmount *mnt; + mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + if (!IS_ERR(mnt)) { + /* + * it is a longterm mount, don't release mnt until + * we unmount before file sys is unregistered + */ + mnt_make_longterm(mnt); + } + return mnt; } EXPORT_SYMBOL_GPL(kern_mount_data); + +void kern_unmount(struct vfsmount *mnt) +{ + /* release long term mount so mount point can be released */ + if (!IS_ERR_OR_NULL(mnt)) { + mnt_make_shortterm(mnt); + mntput(mnt); + } +} +EXPORT_SYMBOL(kern_unmount); diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 76f856e284e..7cf6cafcc00 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -6,7 +6,7 @@ #include <linux/completion.h> #include <linux/sunrpc/cache.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Deferred request handling diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b35d25b98da..1940f1a56a5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -53,7 +53,7 @@ #include <asm/system.h> #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "internal.h" #include "iostat.h" diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 79664a1025a..f20801ae0a1 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -36,6 +36,8 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/nfs_idmap.h> static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { @@ -59,12 +61,10 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) #ifdef CONFIG_NFS_USE_NEW_IDMAPPER -#include <linux/slab.h> #include <linux/cred.h> #include <linux/sunrpc/sched.h> #include <linux/nfs4.h> #include <linux/nfs_fs_sb.h> -#include <linux/nfs_idmap.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <linux/rcupdate.h> @@ -284,18 +284,15 @@ int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/sched.h> - #include <linux/sunrpc/clnt.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include "nfs4_fs.h" #define IDMAP_HASH_SZ 128 diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 27434277165..e49e73107e6 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -427,16 +427,12 @@ int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, } if (!dfacl) return 0; - acl = posix_acl_clone(dfacl, GFP_KERNEL); - error = -ENOMEM; - if (!acl) - goto out_release_dfacl; - error = posix_acl_create_masq(acl, &mode); + acl = posix_acl_dup(dfacl); + error = posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) - goto out_release_acl; + goto out_release_dfacl; error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? dfacl : NULL); -out_release_acl: posix_acl_release(acl); out_release_dfacl: posix_acl_release(dfacl); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ebed5183f42..b39b37f8091 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1594,8 +1594,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int status; bool sync = true; - if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || - wbc->for_background) + if (wbc->sync_mode == WB_SYNC_NONE) sync = false; status = pnfs_layoutcommit_inode(inode, sync); diff --git a/fs/nfsctl.c b/fs/nfsctl.c deleted file mode 100644 index 124e8fcb0dd..00000000000 --- a/fs/nfsctl.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * fs/nfsctl.c - * - * This should eventually move to userland. - * - */ -#include <linux/types.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/nfsd/syscall.h> -#include <linux/cred.h> -#include <linux/sched.h> -#include <linux/linkage.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/syscalls.h> -#include <asm/uaccess.h> - -/* - * open a file on nfsd fs - */ - -static struct file *do_open(char *name, int flags) -{ - struct vfsmount *mnt; - struct file *file; - - mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); - if (IS_ERR(mnt)) - return (struct file *)mnt; - - file = file_open_root(mnt->mnt_root, mnt, name, flags); - - mntput(mnt); /* drop do_kern_mount reference */ - return file; -} - -static struct { - char *name; int wsize; int rsize; -} map[] = { - [NFSCTL_SVC] = { - .name = ".svc", - .wsize = sizeof(struct nfsctl_svc) - }, - [NFSCTL_ADDCLIENT] = { - .name = ".add", - .wsize = sizeof(struct nfsctl_client) - }, - [NFSCTL_DELCLIENT] = { - .name = ".del", - .wsize = sizeof(struct nfsctl_client) - }, - [NFSCTL_EXPORT] = { - .name = ".export", - .wsize = sizeof(struct nfsctl_export) - }, - [NFSCTL_UNEXPORT] = { - .name = ".unexport", - .wsize = sizeof(struct nfsctl_export) - }, - [NFSCTL_GETFD] = { - .name = ".getfd", - .wsize = sizeof(struct nfsctl_fdparm), - .rsize = NFS_FHSIZE - }, - [NFSCTL_GETFS] = { - .name = ".getfs", - .wsize = sizeof(struct nfsctl_fsparm), - .rsize = sizeof(struct knfsd_fh) - }, -}; - -SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg, - void __user *, res) -{ - struct file *file; - void __user *p = &arg->u; - int version; - int err; - - if (copy_from_user(&version, &arg->ca_version, sizeof(int))) - return -EFAULT; - - if (version != NFSCTL_VERSION) - return -EINVAL; - - if (cmd < 0 || cmd >= ARRAY_SIZE(map) || !map[cmd].name) - return -EINVAL; - - file = do_open(map[cmd].name, map[cmd].rsize ? O_RDWR : O_WRONLY); - if (IS_ERR(file)) - return PTR_ERR(file); - err = file->f_op->write(file, p, map[cmd].wsize, &file->f_pos); - if (err >= 0 && map[cmd].rsize) - err = file->f_op->read(file, res, map[cmd].rsize, &file->f_pos); - if (err >= 0) - err = 0; - fput(file); - return err; -} diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index fbb2a5ef581..10e6366608f 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -28,18 +28,6 @@ config NFSD If unsure, say N. -config NFSD_DEPRECATED - bool "Include support for deprecated syscall interface to NFSD" - depends on NFSD - default y - help - The syscall interface to nfsd was obsoleted in 2.6.0 by a new - filesystem based interface. The old interface is due for removal - in 2.6.40. If you wish to remove the interface before then - say N. - - In unsure, say Y. - config NFSD_V2_ACL bool depends on NFSD diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index d892be61016..93cc9d34c45 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -69,7 +69,7 @@ enum { int nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); -int nfsd_cache_lookup(struct svc_rqst *, int); +int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); #ifdef CONFIG_NFSD_V4 diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b9566e46219..f4cc1e2bfc5 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -797,58 +797,6 @@ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) return ek; } -#ifdef CONFIG_NFSD_DEPRECATED -static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, - struct svc_export *exp) -{ - struct svc_expkey key, *ek; - - key.ek_client = clp; - key.ek_fsidtype = fsid_type; - memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); - key.ek_path = exp->ex_path; - key.h.expiry_time = NEVER; - key.h.flags = 0; - - ek = svc_expkey_lookup(&key); - if (ek) - ek = svc_expkey_update(&key,ek); - if (ek) { - cache_put(&ek->h, &svc_expkey_cache); - return 0; - } - return -ENOMEM; -} - -/* - * Find the client's export entry matching xdev/xino. - */ -static inline struct svc_expkey * -exp_get_key(svc_client *clp, dev_t dev, ino_t ino) -{ - u32 fsidv[3]; - - if (old_valid_dev(dev)) { - mk_fsid(FSID_DEV, fsidv, dev, ino, 0, NULL); - return exp_find_key(clp, FSID_DEV, fsidv, NULL); - } - mk_fsid(FSID_ENCODE_DEV, fsidv, dev, ino, 0, NULL); - return exp_find_key(clp, FSID_ENCODE_DEV, fsidv, NULL); -} - -/* - * Find the client's export entry matching fsid - */ -static inline struct svc_expkey * -exp_get_fsid_key(svc_client *clp, int fsid) -{ - u32 fsidv[2]; - - mk_fsid(FSID_NUM, fsidv, 0, 0, fsid, NULL); - - return exp_find_key(clp, FSID_NUM, fsidv, NULL); -} -#endif static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, struct cache_req *reqp) @@ -890,275 +838,7 @@ static struct svc_export *exp_parent(svc_client *clp, struct path *path) return exp; } -#ifdef CONFIG_NFSD_DEPRECATED -/* - * Hashtable locking. Write locks are placed only by user processes - * wanting to modify export information. - * Write locking only done in this file. Read locking - * needed externally. - */ - -static DECLARE_RWSEM(hash_sem); - -void -exp_readlock(void) -{ - down_read(&hash_sem); -} - -static inline void -exp_writelock(void) -{ - down_write(&hash_sem); -} - -void -exp_readunlock(void) -{ - up_read(&hash_sem); -} - -static inline void -exp_writeunlock(void) -{ - up_write(&hash_sem); -} -#else - -/* hash_sem not needed once deprecated interface is removed */ -void exp_readlock(void) {} -static inline void exp_writelock(void){} -void exp_readunlock(void) {} -static inline void exp_writeunlock(void){} - -#endif - -#ifdef CONFIG_NFSD_DEPRECATED -static void exp_do_unexport(svc_export *unexp); -static int exp_verify_string(char *cp, int max); - -static void exp_fsid_unhash(struct svc_export *exp) -{ - struct svc_expkey *ek; - - if ((exp->ex_flags & NFSEXP_FSID) == 0) - return; - - ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); - if (!IS_ERR(ek)) { - sunrpc_invalidate(&ek->h, &svc_expkey_cache); - cache_put(&ek->h, &svc_expkey_cache); - } -} - -static int exp_fsid_hash(svc_client *clp, struct svc_export *exp) -{ - u32 fsid[2]; - - if ((exp->ex_flags & NFSEXP_FSID) == 0) - return 0; - - mk_fsid(FSID_NUM, fsid, 0, 0, exp->ex_fsid, NULL); - return exp_set_key(clp, FSID_NUM, fsid, exp); -} - -static int exp_hash(struct auth_domain *clp, struct svc_export *exp) -{ - u32 fsid[2]; - struct inode *inode = exp->ex_path.dentry->d_inode; - dev_t dev = inode->i_sb->s_dev; - - if (old_valid_dev(dev)) { - mk_fsid(FSID_DEV, fsid, dev, inode->i_ino, 0, NULL); - return exp_set_key(clp, FSID_DEV, fsid, exp); - } - mk_fsid(FSID_ENCODE_DEV, fsid, dev, inode->i_ino, 0, NULL); - return exp_set_key(clp, FSID_ENCODE_DEV, fsid, exp); -} -static void exp_unhash(struct svc_export *exp) -{ - struct svc_expkey *ek; - struct inode *inode = exp->ex_path.dentry->d_inode; - - ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); - if (!IS_ERR(ek)) { - sunrpc_invalidate(&ek->h, &svc_expkey_cache); - cache_put(&ek->h, &svc_expkey_cache); - } -} - -/* - * Export a file system. - */ -int -exp_export(struct nfsctl_export *nxp) -{ - svc_client *clp; - struct svc_export *exp = NULL; - struct svc_export new; - struct svc_expkey *fsid_key = NULL; - struct path path; - int err; - - /* Consistency check */ - err = -EINVAL; - if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || - !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) - goto out; - - dprintk("exp_export called for %s:%s (%x/%ld fl %x).\n", - nxp->ex_client, nxp->ex_path, - (unsigned)nxp->ex_dev, (long)nxp->ex_ino, - nxp->ex_flags); - - /* Try to lock the export table for update */ - exp_writelock(); - - /* Look up client info */ - if (!(clp = auth_domain_find(nxp->ex_client))) - goto out_unlock; - - - /* Look up the dentry */ - err = kern_path(nxp->ex_path, 0, &path); - if (err) - goto out_put_clp; - err = -EINVAL; - - exp = exp_get_by_name(clp, &path, NULL); - - memset(&new, 0, sizeof(new)); - - /* must make sure there won't be an ex_fsid clash */ - if ((nxp->ex_flags & NFSEXP_FSID) && - (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) && - fsid_key->ek_path.mnt && - (fsid_key->ek_path.mnt != path.mnt || - fsid_key->ek_path.dentry != path.dentry)) - goto finish; - - if (!IS_ERR(exp)) { - /* just a flags/id/fsid update */ - - exp_fsid_unhash(exp); - exp->ex_flags = nxp->ex_flags; - exp->ex_anon_uid = nxp->ex_anon_uid; - exp->ex_anon_gid = nxp->ex_anon_gid; - exp->ex_fsid = nxp->ex_dev; - - err = exp_fsid_hash(clp, exp); - goto finish; - } - - err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL); - if (err) goto finish; - - err = -ENOMEM; - - dprintk("nfsd: creating export entry %p for client %p\n", exp, clp); - - new.h.expiry_time = NEVER; - new.h.flags = 0; - new.ex_pathname = kstrdup(nxp->ex_path, GFP_KERNEL); - if (!new.ex_pathname) - goto finish; - new.ex_client = clp; - new.ex_path = path; - new.ex_flags = nxp->ex_flags; - new.ex_anon_uid = nxp->ex_anon_uid; - new.ex_anon_gid = nxp->ex_anon_gid; - new.ex_fsid = nxp->ex_dev; - - exp = svc_export_lookup(&new); - if (exp) - exp = svc_export_update(&new, exp); - - if (!exp) - goto finish; - - if (exp_hash(clp, exp) || - exp_fsid_hash(clp, exp)) { - /* failed to create at least one index */ - exp_do_unexport(exp); - cache_flush(); - } else - err = 0; -finish: - kfree(new.ex_pathname); - if (!IS_ERR_OR_NULL(exp)) - exp_put(exp); - if (!IS_ERR_OR_NULL(fsid_key)) - cache_put(&fsid_key->h, &svc_expkey_cache); - path_put(&path); -out_put_clp: - auth_domain_put(clp); -out_unlock: - exp_writeunlock(); -out: - return err; -} - -/* - * Unexport a file system. The export entry has already - * been removed from the client's list of exported fs's. - */ -static void -exp_do_unexport(svc_export *unexp) -{ - sunrpc_invalidate(&unexp->h, &svc_export_cache); - exp_unhash(unexp); - exp_fsid_unhash(unexp); -} - - -/* - * unexport syscall. - */ -int -exp_unexport(struct nfsctl_export *nxp) -{ - struct auth_domain *dom; - svc_export *exp; - struct path path; - int err; - - /* Consistency check */ - if (!exp_verify_string(nxp->ex_path, NFS_MAXPATHLEN) || - !exp_verify_string(nxp->ex_client, NFSCLNT_IDMAX)) - return -EINVAL; - - exp_writelock(); - - err = -EINVAL; - dom = auth_domain_find(nxp->ex_client); - if (!dom) { - dprintk("nfsd: unexport couldn't find %s\n", nxp->ex_client); - goto out_unlock; - } - - err = kern_path(nxp->ex_path, 0, &path); - if (err) - goto out_domain; - - err = -EINVAL; - exp = exp_get_by_name(dom, &path, NULL); - path_put(&path); - if (IS_ERR(exp)) - goto out_domain; - - exp_do_unexport(exp); - exp_put(exp); - err = 0; - -out_domain: - auth_domain_put(dom); - cache_flush(); -out_unlock: - exp_writeunlock(); - return err; -} -#endif /* CONFIG_NFSD_DEPRECATED */ /* * Obtain the root fh on behalf of a client. @@ -1367,7 +1047,6 @@ static void *e_start(struct seq_file *m, loff_t *pos) unsigned hash, export; struct cache_head *ch; - exp_readlock(); read_lock(&svc_export_cache.hash_lock); if (!n--) return SEQ_START_TOKEN; @@ -1418,7 +1097,6 @@ static void e_stop(struct seq_file *m, void *p) __releases(svc_export_cache.hash_lock) { read_unlock(&svc_export_cache.hash_lock); - exp_readunlock(); } static struct flags { @@ -1550,97 +1228,6 @@ const struct seq_operations nfs_exports_op = { .show = e_show, }; -#ifdef CONFIG_NFSD_DEPRECATED -/* - * Add or modify a client. - * Change requests may involve the list of host addresses. The list of - * exports and possibly existing uid maps are left untouched. - */ -int -exp_addclient(struct nfsctl_client *ncp) -{ - struct auth_domain *dom; - int i, err; - struct in6_addr addr6; - - /* First, consistency check. */ - err = -EINVAL; - if (! exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) - goto out; - if (ncp->cl_naddr > NFSCLNT_ADDRMAX) - goto out; - - /* Lock the hashtable */ - exp_writelock(); - - dom = unix_domain_find(ncp->cl_ident); - - err = -ENOMEM; - if (!dom) - goto out_unlock; - - /* Insert client into hashtable. */ - for (i = 0; i < ncp->cl_naddr; i++) { - ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); - auth_unix_add_addr(&init_net, &addr6, dom); - } - auth_unix_forget_old(dom); - auth_domain_put(dom); - - err = 0; - -out_unlock: - exp_writeunlock(); -out: - return err; -} - -/* - * Delete a client given an identifier. - */ -int -exp_delclient(struct nfsctl_client *ncp) -{ - int err; - struct auth_domain *dom; - - err = -EINVAL; - if (!exp_verify_string(ncp->cl_ident, NFSCLNT_IDMAX)) - goto out; - - /* Lock the hashtable */ - exp_writelock(); - - dom = auth_domain_find(ncp->cl_ident); - /* just make sure that no addresses work - * and that it will expire soon - */ - if (dom) { - err = auth_unix_forget_old(dom); - auth_domain_put(dom); - } - - exp_writeunlock(); -out: - return err; -} - -/* - * Verify that string is non-empty and does not exceed max length. - */ -static int -exp_verify_string(char *cp, int max) -{ - int i; - - for (i = 0; i < max; i++) - if (!cp[i]) - return i; - cp[i] = 0; - printk(KERN_NOTICE "nfsd: couldn't validate string %s\n", cp); - return 0; -} -#endif /* CONFIG_NFSD_DEPRECATED */ /* * Initialize the exports module. @@ -1667,10 +1254,8 @@ nfsd_export_init(void) void nfsd_export_flush(void) { - exp_writelock(); cache_purge(&svc_expkey_cache); cache_purge(&svc_export_cache); - exp_writeunlock(); } /* @@ -1682,12 +1267,9 @@ nfsd_export_shutdown(void) dprintk("nfsd: shutting down export module.\n"); - exp_writelock(); - cache_unregister(&svc_expkey_cache); cache_unregister(&svc_export_cache); svcauth_unix_purge(); - exp_writeunlock(); dprintk("nfsd: export shutdown complete.\n"); } diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 7c831a2731f..77e7a5cca88 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -35,10 +35,8 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); fh.fh_export = NULL; - exp_readlock(); nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); fh_put(&fh); - exp_readunlock(); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3a6dbd70b34..e8077766661 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -291,6 +291,15 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + /* + * RFC5661 18.51.3 + * Before RECLAIM_COMPLETE done, server should deny new lock + */ + if (nfsd4_has_session(cstate) && + !cstate->session->se_client->cl_firststate && + open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_grace; + if (nfsd4_has_session(cstate)) copy_clientid(&open->op_clientid, cstate->session); @@ -998,6 +1007,15 @@ struct nfsd4_operation { nfsd4op_func op_func; u32 op_flags; char *op_name; + /* + * We use the DRC for compounds containing non-idempotent + * operations, *except* those that are 4.1-specific (since + * sessions provide their own EOS), and except for stateful + * operations other than setclientid and setclientid_confirm + * (since sequence numbers provide EOS for open, lock, etc in + * the v4.0 case). + */ + bool op_cacheresult; }; static struct nfsd4_operation nfsd4_ops[]; @@ -1042,6 +1060,11 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) return &nfsd4_ops[op->opnum]; } +bool nfsd4_cache_this_op(struct nfsd4_op *op) +{ + return OPDESC(op)->op_cacheresult; +} + static bool need_wrongsec_check(struct svc_rqst *rqstp) { struct nfsd4_compoundres *resp = rqstp->rq_resp; @@ -1209,7 +1232,6 @@ encode_op: fh_put(&resp->cstate.save_fh); BUG_ON(resp->cstate.replay_owner); out: - nfsd4_release_compoundargs(args); /* Reset deferral mechanism for RPC deferrals */ rqstp->rq_usedeferral = 1; dprintk("nfsv4 compound returned %d\n", ntohl(status)); @@ -1232,6 +1254,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, .op_name = "OP_CREATE", + .op_cacheresult = true, }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, @@ -1249,6 +1272,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, .op_name = "OP_LINK", + .op_cacheresult = true, }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, @@ -1322,10 +1346,12 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, .op_name = "OP_REMOVE", + .op_cacheresult = true, }, [OP_RENAME] = { .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, + .op_cacheresult = true, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, @@ -1351,16 +1377,19 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, .op_name = "OP_SETATTR", + .op_cacheresult = true, }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_SETCLIENTID", + .op_cacheresult = true, }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_SETCLIENTID_CONFIRM", + .op_cacheresult = true, }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, @@ -1369,6 +1398,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, .op_name = "OP_WRITE", + .op_cacheresult = true, }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, @@ -1402,6 +1432,11 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_SEQUENCE", }, + [OP_DESTROY_CLIENTID] = { + .op_func = NULL, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_DESTROY_CLIENTID", + }, [OP_RECLAIM_COMPLETE] = { .op_func = (nfsd4op_func)nfsd4_reclaim_complete, .op_flags = ALLOWED_WITHOUT_FH, @@ -1412,6 +1447,16 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", }, + [OP_TEST_STATEID] = { + .op_func = (nfsd4op_func)nfsd4_test_stateid, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_TEST_STATEID", + }, + [OP_FREE_STATEID] = { + .op_func = (nfsd4op_func)nfsd4_free_stateid, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_FREE_STATEID", + }, }; static const char *nfsd4_op_name(unsigned opnum) @@ -1424,16 +1469,6 @@ static const char *nfsd4_op_name(unsigned opnum) #define nfsd4_voidres nfsd4_voidargs struct nfsd4_voidargs { int dummy; }; -/* - * TODO: At the present time, the NFSv4 server does not do XID caching - * of requests. Implementing XID caching would not be a serious problem, - * although it would require a mild change in interfaces since one - * doesn't know whether an NFSv4 request is idempotent until after the - * XDR decode. However, XID caching totally confuses pynfs (Peter - * Astrand's regression testsuite for NFSv4 servers), which reuses - * XID's liberally, so I've left it unimplemented until pynfs generates - * better XID's. - */ static struct svc_procedure nfsd_procedures4[2] = { [NFSPROC4_NULL] = { .pc_func = (svc_procfunc) nfsd4_proc_null, @@ -1449,6 +1484,7 @@ static struct svc_procedure nfsd_procedures4[2] = { .pc_encode = (kxdrproc_t) nfs4svc_encode_compoundres, .pc_argsize = sizeof(struct nfsd4_compoundargs), .pc_ressize = sizeof(struct nfsd4_compoundres), + .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, .pc_xdrressize = NFSD_BUFSIZE/4, }, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e98f3c2e949..3787ec11740 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/swap.h> +#include <linux/pagemap.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/clnt.h> #include "xdr4.h" @@ -60,9 +61,12 @@ static u64 current_sessionid = 1; /* forward declarations */ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +static struct nfs4_stateid * search_for_stateid(stateid_t *stid); +static struct nfs4_delegation * search_for_delegation(stateid_t *stid); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static void nfs4_set_recdir(char *recdir); +static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); /* Locking: */ @@ -381,14 +385,6 @@ static int nfs4_access_to_omode(u32 access) BUG(); } -static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp) -{ - unsigned int access; - - set_access(&access, stp->st_access_bmap); - return nfs4_access_to_omode(access); -} - static void unhash_generic_stateid(struct nfs4_stateid *stp) { list_del(&stp->st_hash); @@ -398,11 +394,14 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp) static void free_generic_stateid(struct nfs4_stateid *stp) { - int oflag; + int i; if (stp->st_access_bmap) { - oflag = nfs4_access_bmap_to_omode(stp); - nfs4_file_put_access(stp->st_file, oflag); + for (i = 1; i < 4; i++) { + if (test_bit(i, &stp->st_access_bmap)) + nfs4_file_put_access(stp->st_file, + nfs4_access_to_omode(i)); + } } put_nfs4_file(stp->st_file); kmem_cache_free(stateid_slab, stp); @@ -1507,6 +1506,29 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, return slot->sl_status; } +#define NFSD_MIN_REQ_HDR_SEQ_SZ ((\ + 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* version, opcount, opcode */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, cache */ \ + 4 ) * sizeof(__be32)) + +#define NFSD_MIN_RESP_HDR_SEQ_SZ ((\ + 2 + /* verifier: AUTH_NULL, length 0 */\ + 1 + /* status */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* opcount, opcode, opstatus*/ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, slotID, status */ \ + 5 ) * sizeof(__be32)) + +static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel) +{ + return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ + || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ; +} + __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1575,6 +1597,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, cr_ses->flags &= ~SESSION4_PERSIST; cr_ses->flags &= ~SESSION4_RDMA; + status = nfserr_toosmall; + if (check_forechannel_attrs(cr_ses->fore_channel)) + goto out; + status = nfserr_jukebox; new = alloc_init_session(rqstp, conf, cr_ses); if (!new) @@ -1736,6 +1762,14 @@ static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_sess return args->opcnt > session->se_fchannel.maxops; } +static bool nfsd4_request_too_big(struct svc_rqst *rqstp, + struct nfsd4_session *session) +{ + struct xdr_buf *xb = &rqstp->rq_arg; + + return xb->len > session->se_fchannel.maxreq_sz; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1768,6 +1802,10 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (nfsd4_session_too_many_ops(rqstp, session)) goto out; + status = nfserr_req_too_big; + if (nfsd4_request_too_big(rqstp, session)) + goto out; + status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) goto out; @@ -2337,15 +2375,6 @@ out: return ret; } -static inline void -nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) -{ - if (share_access & NFS4_SHARE_ACCESS_WRITE) - nfs4_file_put_access(fp, O_WRONLY); - if (share_access & NFS4_SHARE_ACCESS_READ) - nfs4_file_put_access(fp, O_RDONLY); -} - static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { /* We're assuming the state code never drops its reference @@ -2396,8 +2425,8 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) } static const struct lock_manager_operations nfsd_lease_mng_ops = { - .fl_break = nfsd_break_deleg_cb, - .fl_change = nfsd_change_deleg_cb, + .lm_break = nfsd_break_deleg_cb, + .lm_change = nfsd_change_deleg_cb, }; @@ -2556,12 +2585,18 @@ static inline int nfs4_access_to_access(u32 nfs4_access) return flags; } -static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file -*fp, struct svc_fh *cur_fh, u32 nfs4_access) +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfsd4_open *open) { __be32 status; - int oflag = nfs4_access_to_omode(nfs4_access); - int access = nfs4_access_to_access(nfs4_access); + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); + + /* CLAIM_DELEGATE_CUR is used in response to a broken lease; + * allowing it to break the lease and return EAGAIN leaves the + * client unable to make progress in returning the delegation */ + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + access |= NFSD_MAY_NOT_BREAK_LEASE; if (!fp->fi_fds[oflag]) { status = nfsd_open(rqstp, cur_fh, S_IFREG, access, @@ -2586,7 +2621,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, if (stp == NULL) return nfserr_resource; - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access); + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) { kmem_cache_free(stateid_slab, stp); return status; @@ -2619,14 +2654,14 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c new_access = !test_bit(op_share_access, &stp->st_access_bmap); if (new_access) { - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access); + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) return status; } status = nfsd4_truncate(rqstp, cur_fh, open); if (status) { if (new_access) { - int oflag = nfs4_access_to_omode(new_access); + int oflag = nfs4_access_to_omode(op_share_access); nfs4_file_put_access(fp, oflag); } return status; @@ -3137,6 +3172,37 @@ static int is_delegation_stateid(stateid_t *stateid) return stateid->si_fileid == 0; } +static int is_open_stateid(struct nfs4_stateid *stateid) +{ + return stateid->st_openstp == NULL; +} + +__be32 nfs4_validate_stateid(stateid_t *stateid, int flags) +{ + struct nfs4_stateid *stp = NULL; + __be32 status = nfserr_stale_stateid; + + if (STALE_STATEID(stateid)) + goto out; + + status = nfserr_expired; + stp = search_for_stateid(stateid); + if (!stp) + goto out; + status = nfserr_bad_stateid; + + if (!stp->st_stateowner->so_confirmed) + goto out; + + status = check_stateid_generation(stateid, &stp->st_stateid, flags); + if (status) + goto out; + + status = nfs_ok; +out: + return status; +} + /* * Checks for stateid operations */ @@ -3216,6 +3282,81 @@ out: return status; } +static __be32 +nfsd4_free_delegation_stateid(stateid_t *stateid) +{ + struct nfs4_delegation *dp = search_for_delegation(stateid); + if (dp) + return nfserr_locks_held; + return nfserr_bad_stateid; +} + +static __be32 +nfsd4_free_lock_stateid(struct nfs4_stateid *stp) +{ + if (check_for_locks(stp->st_file, stp->st_stateowner)) + return nfserr_locks_held; + release_lock_stateid(stp); + return nfs_ok; +} + +/* + * Test if the stateid is valid + */ +__be32 +nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_test_stateid *test_stateid) +{ + test_stateid->ts_has_session = nfsd4_has_session(cstate); + return nfs_ok; +} + +/* + * Free a state id + */ +__be32 +nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_free_stateid *free_stateid) +{ + stateid_t *stateid = &free_stateid->fr_stateid; + struct nfs4_stateid *stp; + __be32 ret; + + nfs4_lock_state(); + if (is_delegation_stateid(stateid)) { + ret = nfsd4_free_delegation_stateid(stateid); + goto out; + } + + stp = search_for_stateid(stateid); + if (!stp) { + ret = nfserr_bad_stateid; + goto out; + } + if (stateid->si_generation != 0) { + if (stateid->si_generation < stp->st_stateid.si_generation) { + ret = nfserr_old_stateid; + goto out; + } + if (stateid->si_generation > stp->st_stateid.si_generation) { + ret = nfserr_bad_stateid; + goto out; + } + } + + if (is_open_stateid(stp)) { + ret = nfserr_locks_held; + goto out; + } else { + ret = nfsd4_free_lock_stateid(stp); + goto out; + } + +out: + nfs4_unlock_state(); + return ret; +} + static inline int setlkflg (int type) { @@ -3384,18 +3525,15 @@ out: return status; } - -/* - * unset all bits in union bitmap (bmap) that - * do not exist in share (from successful OPEN_DOWNGRADE) - */ -static void -reset_union_bmap_access(unsigned long access, unsigned long *bmap) +static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) { int i; + for (i = 1; i < 4; i++) { - if ((i & access) != i) - __clear_bit(i, bmap); + if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) { + nfs4_file_put_access(stp->st_file, i); + __clear_bit(i, &stp->st_access_bmap); + } } } @@ -3416,7 +3554,6 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, { __be32 status; struct nfs4_stateid *stp; - unsigned int share_access; dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", (int)cstate->current_fh.fh_dentry->d_name.len, @@ -3445,10 +3582,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, stp->st_deny_bmap, od->od_share_deny); goto out; } - set_access(&share_access, stp->st_access_bmap); - nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access); + nfs4_file_downgrade(stp, od->od_share_access); - reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); update_stateid(&stp->st_stateid); @@ -3594,6 +3729,14 @@ static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; +static int +same_stateid(stateid_t *id_one, stateid_t *id_two) +{ + if (id_one->si_stateownerid != id_two->si_stateownerid) + return 0; + return id_one->si_fileid == id_two->si_fileid; +} + static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags) { @@ -3623,6 +3766,44 @@ find_stateid(stateid_t *stid, int flags) return NULL; } +static struct nfs4_stateid * +search_for_stateid(stateid_t *stid) +{ + struct nfs4_stateid *local; + unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid); + + list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { + if (same_stateid(&local->st_stateid, stid)) + return local; + } + + list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { + if (same_stateid(&local->st_stateid, stid)) + return local; + } + return NULL; +} + +static struct nfs4_delegation * +search_for_delegation(stateid_t *stid) +{ + struct nfs4_file *fp; + struct nfs4_delegation *dp; + struct list_head *pos; + int i; + + for (i = 0; i < FILE_HASH_SIZE; i++) { + list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { + list_for_each(pos, &fp->fi_delegations) { + dp = list_entry(pos, struct nfs4_delegation, dl_perfile); + if (same_stateid(&dp->dl_stateid, stid)) + return dp; + } + } + } + return NULL; +} + static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 99018110321..c8bf405d19d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -44,13 +44,15 @@ #include <linux/namei.h> #include <linux/statfs.h> #include <linux/utsname.h> +#include <linux/pagemap.h> #include <linux/sunrpc/svcauth_gss.h> #include "idmap.h" #include "acl.h" #include "xdr4.h" #include "vfs.h" - +#include "state.h" +#include "cache.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -131,6 +133,22 @@ xdr_error: \ } \ } while (0) +static void save_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep) +{ + savep->p = argp->p; + savep->end = argp->end; + savep->pagelen = argp->pagelen; + savep->pagelist = argp->pagelist; +} + +static void restore_buf(struct nfsd4_compoundargs *argp, struct nfsd4_saved_compoundargs *savep) +{ + argp->p = savep->p; + argp->end = savep->end; + argp->pagelen = savep->pagelen; + argp->pagelist = savep->pagelist; +} + static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) { /* We want more bytes than seem to be available. @@ -1246,6 +1264,19 @@ nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, } static __be32 +nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, + struct nfsd4_free_stateid *free_stateid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(free_stateid->fr_stateid.si_generation); + COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, struct nfsd4_sequence *seq) { @@ -1261,6 +1292,40 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, DECODE_TAIL; } +static __be32 +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +{ + unsigned int nbytes; + stateid_t si; + int i; + __be32 *p; + __be32 status; + + READ_BUF(4); + test_stateid->ts_num_ids = ntohl(*p++); + + nbytes = test_stateid->ts_num_ids * sizeof(stateid_t); + if (nbytes > (u32)((char *)argp->end - (char *)argp->p)) + goto xdr_error; + + test_stateid->ts_saved_args = argp; + save_buf(argp, &test_stateid->ts_savedp); + + for (i = 0; i < test_stateid->ts_num_ids; i++) { + status = nfsd4_decode_stateid(argp, &si); + if (status) + return status; + } + + status = 0; +out: + return status; +xdr_error: + dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__); + status = nfserr_bad_xdr; + goto out; +} + static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) { DECODE_HEAD; @@ -1370,7 +1435,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -1380,7 +1445,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, @@ -1402,6 +1467,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) DECODE_HEAD; struct nfsd4_op *op; struct nfsd4_minorversion_ops *ops; + bool cachethis = false; int i; /* @@ -1483,7 +1549,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->opcnt = i+1; break; } + /* + * We'll try to cache the result in the DRC if any one + * op in the compound wants to be cached: + */ + cachethis |= nfsd4_cache_this_op(op); } + /* Sessions make the DRC unnecessary: */ + if (argp->minorversion) + cachethis = false; + argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; DECODE_TAIL; } @@ -3116,6 +3191,21 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, } static __be32 +nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_free_stateid *free_stateid) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(4); + WRITE32(nfserr); + ADJUST_ARGS(); + return nfserr; +} + +static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_sequence *seq) { @@ -3138,6 +3228,36 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, return 0; } +__be32 +nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_test_stateid *test_stateid) +{ + struct nfsd4_compoundargs *argp; + stateid_t si; + __be32 *p; + int i; + int valid; + + restore_buf(test_stateid->ts_saved_args, &test_stateid->ts_savedp); + argp = test_stateid->ts_saved_args; + + RESERVE_SPACE(4); + *p++ = htonl(test_stateid->ts_num_ids); + resp->p = p; + + nfs4_lock_state(); + for (i = 0; i < test_stateid->ts_num_ids; i++) { + nfsd4_decode_stateid(argp, &si); + valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); + RESERVE_SPACE(4); + *p++ = htonl(valid); + resp->p = p; + } + nfs4_unlock_state(); + + return nfserr; +} + static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { @@ -3196,7 +3316,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, @@ -3206,7 +3326,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, - [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, @@ -3319,8 +3439,11 @@ nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) return xdr_ressize_check(rqstp, p); } -void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args) +int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) { + struct svc_rqst *rqstp = rq; + struct nfsd4_compoundargs *args = rqstp->rq_argp; + if (args->ops != args->iops) { kfree(args->ops); args->ops = args->iops; @@ -3333,13 +3456,12 @@ void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args) tb->release(tb->buf); kfree(tb); } + return 1; } int nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) { - __be32 status; - args->p = p; args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; args->pagelist = rqstp->rq_arg.pages; @@ -3349,11 +3471,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_comp args->ops = args->iops; args->rqstp = rqstp; - status = nfsd4_decode_compound(args); - if (status) { - nfsd4_release_compoundargs(args); - } - return !status; + return !nfsd4_decode_compound(args); } int diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 4666a209678..2cbac34a55d 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -118,7 +118,7 @@ hash_refile(struct svc_cacherep *rp) * Note that no operation within the loop may sleep. */ int -nfsd_cache_lookup(struct svc_rqst *rqstp, int type) +nfsd_cache_lookup(struct svc_rqst *rqstp) { struct hlist_node *hn; struct hlist_head *rh; @@ -128,6 +128,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type) vers = rqstp->rq_vers, proc = rqstp->rq_proc; unsigned long age; + int type = rqstp->rq_cachetype; int rtn; rqstp->rq_cacherep = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2b1449dd2f4..c7716143cbd 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -24,15 +24,6 @@ */ enum { NFSD_Root = 1, -#ifdef CONFIG_NFSD_DEPRECATED - NFSD_Svc, - NFSD_Add, - NFSD_Del, - NFSD_Export, - NFSD_Unexport, - NFSD_Getfd, - NFSD_Getfs, -#endif NFSD_List, NFSD_Export_features, NFSD_Fh, @@ -59,15 +50,6 @@ enum { /* * write() for these nodes. */ -#ifdef CONFIG_NFSD_DEPRECATED -static ssize_t write_svc(struct file *file, char *buf, size_t size); -static ssize_t write_add(struct file *file, char *buf, size_t size); -static ssize_t write_del(struct file *file, char *buf, size_t size); -static ssize_t write_export(struct file *file, char *buf, size_t size); -static ssize_t write_unexport(struct file *file, char *buf, size_t size); -static ssize_t write_getfd(struct file *file, char *buf, size_t size); -static ssize_t write_getfs(struct file *file, char *buf, size_t size); -#endif static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); @@ -83,15 +65,6 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif static ssize_t (*write_op[])(struct file *, char *, size_t) = { -#ifdef CONFIG_NFSD_DEPRECATED - [NFSD_Svc] = write_svc, - [NFSD_Add] = write_add, - [NFSD_Del] = write_del, - [NFSD_Export] = write_export, - [NFSD_Unexport] = write_unexport, - [NFSD_Getfd] = write_getfd, - [NFSD_Getfs] = write_getfs, -#endif [NFSD_Fh] = write_filehandle, [NFSD_FO_UnlockIP] = write_unlock_ip, [NFSD_FO_UnlockFS] = write_unlock_fs, @@ -130,16 +103,6 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { -#ifdef CONFIG_NFSD_DEPRECATED - static int warned; - if (file->f_dentry->d_name.name[0] == '.' && !warned) { - printk(KERN_INFO - "Warning: \"%s\" uses deprecated NFSD interface: %s." - " This will be removed in 2.6.40\n", - current->comm, file->f_dentry->d_name.name); - warned = 1; - } -#endif if (! file->private_data) { /* An attempt to read a transaction file without writing * causes a 0-byte write so that the file can return @@ -226,303 +189,6 @@ static const struct file_operations pool_stats_operations = { * payload - write methods */ -#ifdef CONFIG_NFSD_DEPRECATED -/** - * write_svc - Start kernel's NFSD server - * - * Deprecated. /proc/fs/nfsd/threads is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_svc - * svc_port: port number of this - * server's listener - * svc_nthreads: number of threads to start - * size: size in bytes of passed in nfsctl_svc - * Output: - * On success: returns zero - * On error: return code is negative errno value - */ -static ssize_t write_svc(struct file *file, char *buf, size_t size) -{ - struct nfsctl_svc *data; - int err; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_svc*) buf; - err = nfsd_svc(data->svc_port, data->svc_nthreads); - if (err < 0) - return err; - return 0; -} - -/** - * write_add - Add or modify client entry in auth unix cache - * - * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_client - * cl_ident: '\0'-terminated C string - * containing domain name - * of client - * cl_naddr: no. of items in cl_addrlist - * cl_addrlist: array of client addresses - * cl_fhkeytype: ignored - * cl_fhkeylen: ignored - * cl_fhkey: ignored - * size: size in bytes of passed in nfsctl_client - * Output: - * On success: returns zero - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since - * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. - */ -static ssize_t write_add(struct file *file, char *buf, size_t size) -{ - struct nfsctl_client *data; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_client *)buf; - return exp_addclient(data); -} - -/** - * write_del - Remove client from auth unix cache - * - * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_client - * cl_ident: '\0'-terminated C string - * containing domain name - * of client - * cl_naddr: ignored - * cl_addrlist: ignored - * cl_fhkeytype: ignored - * cl_fhkeylen: ignored - * cl_fhkey: ignored - * size: size in bytes of passed in nfsctl_client - * Output: - * On success: returns zero - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since - * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. - */ -static ssize_t write_del(struct file *file, char *buf, size_t size) -{ - struct nfsctl_client *data; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_client *)buf; - return exp_delclient(data); -} - -/** - * write_export - Export part or all of a local file system - * - * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_export - * ex_client: '\0'-terminated C string - * containing domain name - * of client allowed to access - * this export - * ex_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * ex_dev: fsid to use for this export - * ex_ino: ignored - * ex_flags: export flags for this export - * ex_anon_uid: UID to use for anonymous - * requests - * ex_anon_gid: GID to use for anonymous - * requests - * size: size in bytes of passed in nfsctl_export - * Output: - * On success: returns zero - * On error: return code is negative errno value - */ -static ssize_t write_export(struct file *file, char *buf, size_t size) -{ - struct nfsctl_export *data; - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_export*)buf; - return exp_export(data); -} - -/** - * write_unexport - Unexport a previously exported file system - * - * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_export - * ex_client: '\0'-terminated C string - * containing domain name - * of client no longer allowed - * to access this export - * ex_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * ex_dev: ignored - * ex_ino: ignored - * ex_flags: ignored - * ex_anon_uid: ignored - * ex_anon_gid: ignored - * size: size in bytes of passed in nfsctl_export - * Output: - * On success: returns zero - * On error: return code is negative errno value - */ -static ssize_t write_unexport(struct file *file, char *buf, size_t size) -{ - struct nfsctl_export *data; - - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_export*)buf; - return exp_unexport(data); -} - -/** - * write_getfs - Get a variable-length NFS file handle by path - * - * Deprecated. /proc/fs/nfsd/filehandle is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_fsparm - * gd_addr: socket address of client - * gd_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * gd_maxlen: maximum size of returned file - * handle - * size: size in bytes of passed in nfsctl_fsparm - * Output: - * On success: passed-in buffer filled with a knfsd_fh structure - * (a variable-length raw NFS file handle); - * return code is the size in bytes of the file handle - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since gd_addr - * is the same size as a struct sockaddr_in. - */ -static ssize_t write_getfs(struct file *file, char *buf, size_t size) -{ - struct nfsctl_fsparm *data; - struct sockaddr_in *sin; - struct auth_domain *clp; - int err = 0; - struct knfsd_fh *res; - struct in6_addr in6; - - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_fsparm*)buf; - err = -EPROTONOSUPPORT; - if (data->gd_addr.sa_family != AF_INET) - goto out; - sin = (struct sockaddr_in *)&data->gd_addr; - if (data->gd_maxlen > NFS3_FHSIZE) - data->gd_maxlen = NFS3_FHSIZE; - - res = (struct knfsd_fh*)buf; - - exp_readlock(); - - ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); - - clp = auth_unix_lookup(&init_net, &in6); - if (!clp) - err = -EPERM; - else { - err = exp_rootfh(clp, data->gd_path, res, data->gd_maxlen); - auth_domain_put(clp); - } - exp_readunlock(); - if (err == 0) - err = res->fh_size + offsetof(struct knfsd_fh, fh_base); - out: - return err; -} - -/** - * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) - * - * Deprecated. /proc/fs/nfsd/filehandle is preferred. - * Function remains to support old versions of nfs-utils. - * - * Input: - * buf: struct nfsctl_fdparm - * gd_addr: socket address of client - * gd_path: '\0'-terminated C string - * containing pathname of - * directory in local file system - * gd_version: fdparm structure version - * size: size in bytes of passed in nfsctl_fdparm - * Output: - * On success: passed-in buffer filled with nfsctl_res - * (a fixed-length raw NFS file handle); - * return code is the size in bytes of the file handle - * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in, since gd_addr - * is the same size as a struct sockaddr_in. - */ -static ssize_t write_getfd(struct file *file, char *buf, size_t size) -{ - struct nfsctl_fdparm *data; - struct sockaddr_in *sin; - struct auth_domain *clp; - int err = 0; - struct knfsd_fh fh; - char *res; - struct in6_addr in6; - - if (size < sizeof(*data)) - return -EINVAL; - data = (struct nfsctl_fdparm*)buf; - err = -EPROTONOSUPPORT; - if (data->gd_addr.sa_family != AF_INET) - goto out; - err = -EINVAL; - if (data->gd_version < 2 || data->gd_version > NFSSVC_MAXVERS) - goto out; - - res = buf; - sin = (struct sockaddr_in *)&data->gd_addr; - exp_readlock(); - - ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); - - clp = auth_unix_lookup(&init_net, &in6); - if (!clp) - err = -EPERM; - else { - err = exp_rootfh(clp, data->gd_path, &fh, NFS_FHSIZE); - auth_domain_put(clp); - } - exp_readunlock(); - - if (err == 0) { - memset(res,0, NFS_FHSIZE); - memcpy(res, &fh.fh_base, fh.fh_size); - err = NFS_FHSIZE; - } - out: - return err; -} -#endif /* CONFIG_NFSD_DEPRECATED */ /** * write_unlock_ip - Release all locks used by a client @@ -1397,15 +1063,6 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { static struct tree_descr nfsd_files[] = { -#ifdef CONFIG_NFSD_DEPRECATED - [NFSD_Svc] = {".svc", &transaction_ops, S_IWUSR}, - [NFSD_Add] = {".add", &transaction_ops, S_IWUSR}, - [NFSD_Del] = {".del", &transaction_ops, S_IWUSR}, - [NFSD_Export] = {".export", &transaction_ops, S_IWUSR}, - [NFSD_Unexport] = {".unexport", &transaction_ops, S_IWUSR}, - [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, - [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, -#endif [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 18743c4d8bc..dc5a1bf476b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -528,16 +528,9 @@ nfsd(void *vrqstp) continue; } - - /* Lock the export hash tables for reading. */ - exp_readlock(); - validate_process_creds(); svc_process(rqstp); validate_process_creds(); - - /* Unlock export hash tables */ - exp_readunlock(); } /* Clear signals before calling svc_exit_thread() */ @@ -577,8 +570,22 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) rqstp->rq_vers, rqstp->rq_proc); proc = rqstp->rq_procinfo; + /* + * Give the xdr decoder a chance to change this if it wants + * (necessary in the NFSv4.0 compound case) + */ + rqstp->rq_cachetype = proc->pc_cachetype; + /* Decode arguments */ + xdr = proc->pc_decode; + if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base, + rqstp->rq_argp)) { + dprintk("nfsd: failed to decode arguments!\n"); + *statp = rpc_garbage_args; + return 1; + } + /* Check whether we have this call in the cache. */ - switch (nfsd_cache_lookup(rqstp, proc->pc_cachetype)) { + switch (nfsd_cache_lookup(rqstp)) { case RC_INTR: case RC_DROPIT: return 0; @@ -588,16 +595,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* do it */ } - /* Decode arguments */ - xdr = proc->pc_decode; - if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base, - rqstp->rq_argp)) { - dprintk("nfsd: failed to decode arguments!\n"); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); - *statp = rpc_garbage_args; - return 1; - } - /* need to grab the location to store the status, as * nfsv4 does some encoding while processing */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 6bd2f3c21f2..4eefaf1b42e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -482,6 +482,7 @@ extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); +extern __be32 nfs4_validate_stateid(stateid_t *, int); static inline void nfs4_put_stateowner(struct nfs4_stateowner *so) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 366401e1a53..d2a8d04428c 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -342,6 +342,25 @@ struct nfsd4_setclientid_confirm { nfs4_verifier sc_confirm; }; +struct nfsd4_saved_compoundargs { + __be32 *p; + __be32 *end; + int pagelen; + struct page **pagelist; +}; + +struct nfsd4_test_stateid { + __be32 ts_num_ids; + __be32 ts_has_session; + struct nfsd4_compoundargs *ts_saved_args; + struct nfsd4_saved_compoundargs ts_savedp; +}; + +struct nfsd4_free_stateid { + stateid_t fr_stateid; /* request */ + __be32 fr_status; /* response */ +}; + /* also used for NVERIFY */ struct nfsd4_verify { u32 ve_bmval[3]; /* request */ @@ -432,10 +451,14 @@ struct nfsd4_op { struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; struct nfsd4_reclaim_complete reclaim_complete; + struct nfsd4_test_stateid test_stateid; + struct nfsd4_free_stateid free_stateid; } u; struct nfs4_replay * replay; }; +bool nfsd4_cache_this_op(struct nfsd4_op *); + struct nfsd4_compoundargs { /* scratch variables for XDR decode */ __be32 * p; @@ -458,6 +481,7 @@ struct nfsd4_compoundargs { u32 opcnt; struct nfsd4_op *ops; struct nfsd4_op iops[8]; + int cachetype; }; struct nfsd4_compoundres { @@ -559,11 +583,15 @@ extern __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_release_lockowner *rlockowner); -extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); +extern int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp); extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); extern __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *, clientid_t *clid); +extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid); +extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); #endif /* diff --git a/fs/notify/group.c b/fs/notify/group.c index d309f38449c..63fc294a469 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -26,7 +26,7 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Final freeing of a group diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 07ea8d3e6ea..b13c00ac48e 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -23,7 +23,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 252ab1f6452..e14587d5568 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -92,7 +92,7 @@ #include <linux/spinlock.h> #include <linux/srcu.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/notification.c b/fs/notify/notification.c index f39260f8f86..ee188158a22 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -43,7 +43,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index e86577d6c5c..778fe6cae3b 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -24,7 +24,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 2dabf813456..fe8e7e92888 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -24,7 +24,7 @@ #ifndef _LINUX_NTFS_INODE_H #define _LINUX_NTFS_INODE_H -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fs.h> #include <linux/list.h> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 1cee970eb55..783c58d9daf 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -290,47 +290,32 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_check_acl(struct inode *inode, int mask) +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; int ret = -EAGAIN; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return ret; + return NULL; ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) { - mlog_errno(ret); - return ret; - } + if (ret < 0) + return ERR_PTR(ret); - acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + acl = ocfs2_get_acl_nolock(inode, type, di_bh); brelse(di_bh); - if (IS_ERR(acl)) { - mlog_errno(PTR_ERR(acl)); - return PTR_ERR(acl); - } - if (acl) { - ret = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return ret; - } - - return -EAGAIN; + return acl; } int ocfs2_acl_chmod(struct inode *inode) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl, *clone; + struct posix_acl *acl; int ret; if (S_ISLNK(inode->i_mode)) @@ -342,15 +327,12 @@ int ocfs2_acl_chmod(struct inode *inode) acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); + ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); posix_acl_release(acl); - if (!clone) - return -ENOMEM; - ret = posix_acl_chmod_masq(clone, inode->i_mode); - if (!ret) - ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, - clone, NULL, NULL); - posix_acl_release(clone); return ret; } @@ -388,8 +370,6 @@ int ocfs2_init_acl(handle_t *handle, } } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { - struct posix_acl *clone; - if (S_ISDIR(inode->i_mode)) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_DEFAULT, acl, @@ -397,27 +377,22 @@ int ocfs2_init_acl(handle_t *handle, if (ret) goto cleanup; } - clone = posix_acl_clone(acl, GFP_NOFS); - ret = -ENOMEM; - if (!clone) - goto cleanup; - mode = inode->i_mode; - ret = posix_acl_create_masq(clone, &mode); - if (ret >= 0) { - ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret2) { - mlog_errno(ret2); - ret = ret2; - goto cleanup; - } - if (ret > 0) { - ret = ocfs2_set_acl(handle, inode, - di_bh, ACL_TYPE_ACCESS, - clone, meta_ac, data_ac); - } + ret = posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); } - posix_acl_release(clone); } cleanup: posix_acl_release(acl); diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 5c5d31f0585..071fbd380f2 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,7 +26,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -extern int ocfs2_check_acl(struct inode *, int); +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0fc2bd34039..de4ea1af041 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2600,14 +2600,14 @@ const struct inode_operations ocfs2_file_iops = { .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; /* diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index cd9427023d2..d53cb706f14 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -36,7 +36,6 @@ #include "dir.h" #include "buffer_head_io.h" #include "sysfile.h" -#include "suballoc.h" #include "refcounttree.h" #include "move_extents.h" diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 33889dc52dd..53aa41ed7bf 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2498,5 +2498,5 @@ const struct inode_operations ocfs2_dir_iops = { .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, - .check_acl = ocfs2_check_acl, + .get_acl = ocfs2_iop_get_acl, }; diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 3b8d3979e03..98e54427439 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -93,7 +93,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb) memset(bh->b_data, 0, sizeof(struct omfs_inode)); - if (inode->i_mode & S_IFDIR) { + if (S_ISDIR(inode->i_mode)) { memset(&bh->b_data[OMFS_DIR_START], 0xff, sbi->s_sys_blocksize - OMFS_DIR_START); } else diff --git a/fs/open.c b/fs/open.c index 739b751aa73..f7119210945 100644 --- a/fs/open.c +++ b/fs/open.c @@ -446,74 +446,52 @@ out: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +static int chmod_common(struct path *path, umode_t mode) { - struct inode * inode; - struct dentry * dentry; - struct file * file; - int err = -EBADF; + struct inode *inode = path->dentry->d_inode; struct iattr newattrs; + int error; - file = fget(fd); - if (!file) - goto out; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - - audit_inode(NULL, dentry); - - err = mnt_want_write_file(file); - if (err) - goto out_putf; + error = mnt_want_write(path->mnt); + if (error) + return error; mutex_lock(&inode->i_mutex); - err = security_path_chmod(dentry, file->f_vfsmnt, mode); - if (err) + error = security_path_chmod(path->dentry, path->mnt, mode); + if (error) goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); + error = notify_change(path->dentry, &newattrs); out_unlock: mutex_unlock(&inode->i_mutex); - mnt_drop_write(file->f_path.mnt); -out_putf: - fput(file); -out: + mnt_drop_write(path->mnt); + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +{ + struct file * file; + int err = -EBADF; + + file = fget(fd); + if (file) { + audit_inode(NULL, file->f_path.dentry); + err = chmod_common(&file->f_path, mode); + fput(file); + } return err; } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) { struct path path; - struct inode *inode; int error; - struct iattr newattrs; error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); - if (error) - goto out; - inode = path.dentry->d_inode; - - error = mnt_want_write(path.mnt); - if (error) - goto dput_and_out; - mutex_lock(&inode->i_mutex); - error = security_path_chmod(path.dentry, path.mnt, mode); - if (error) - goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path.dentry, &newattrs); -out_unlock: - mutex_unlock(&inode->i_mutex); - mnt_drop_write(path.mnt); -dput_and_out: - path_put(&path); -out: + if (!error) { + error = chmod_common(&path, mode); + path_put(&path); + } return error; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index d545e97d99c..e3c63d1c5e1 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -237,22 +237,22 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } -ssize_t part_ro_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%d\n", p->policy ? 1 : 0); } -ssize_t part_alignment_offset_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); } -ssize_t part_discard_alignment_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%u\n", p->discard_alignment); diff --git a/fs/pipe.c b/fs/pipe.c index da42f7db50d..0e0be1dc0f8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -948,7 +948,7 @@ static const struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { - struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) @@ -1291,8 +1291,8 @@ static int __init init_pipe_fs(void) static void __exit exit_pipe_fs(void) { + kern_unmount(pipe_mnt); unregister_filesystem(&pipe_fs_type); - mntput(pipe_mnt); } fs_initcall(init_pipe_fs); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index b1cf6bf4b41..d43729a760e 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -14,7 +14,7 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/posix_acl.h> @@ -24,13 +24,9 @@ EXPORT_SYMBOL(posix_acl_init); EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_clone); EXPORT_SYMBOL(posix_acl_valid); EXPORT_SYMBOL(posix_acl_equiv_mode); EXPORT_SYMBOL(posix_acl_from_mode); -EXPORT_SYMBOL(posix_acl_create_masq); -EXPORT_SYMBOL(posix_acl_chmod_masq); -EXPORT_SYMBOL(posix_acl_permission); /* * Init a fresh posix_acl @@ -59,7 +55,7 @@ posix_acl_alloc(int count, gfp_t flags) /* * Clone an ACL. */ -struct posix_acl * +static struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -283,8 +279,7 @@ check_perm: * system calls. All permissions that are not granted by the acl are removed. * The permissions in the acl are changed to reflect the mode_p parameter. */ -int -posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) +static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; @@ -341,8 +336,7 @@ posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) /* * Modify the ACL for the chmod syscall. */ -int -posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) +static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -386,3 +380,39 @@ posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) return 0; } + +int +posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) +{ + struct posix_acl *clone = posix_acl_clone(*acl, gfp); + int err = -ENOMEM; + if (clone) { + err = posix_acl_create_masq(clone, mode_p); + if (err < 0) { + posix_acl_release(clone); + clone = NULL; + } + } + posix_acl_release(*acl); + *acl = clone; + return err; +} +EXPORT_SYMBOL(posix_acl_create); + +int +posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, mode_t mode) +{ + struct posix_acl *clone = posix_acl_clone(*acl, gfp); + int err = -ENOMEM; + if (clone) { + err = posix_acl_chmod_masq(clone, mode); + if (err) { + posix_acl_release(clone); + clone = NULL; + } + } + posix_acl_release(*acl); + *acl = clone; + return err; +} +EXPORT_SYMBOL(posix_acl_chmod); diff --git a/fs/proc/base.c b/fs/proc/base.c index 91fb655a5cb..08e3eccf9a1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1118,10 +1118,9 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, " - "please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), - task_pid_nr(task), task_pid_nr(task)); + WARN_ONCE(1, "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); task->signal->oom_adj = oom_adjust; /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum @@ -2707,9 +2706,16 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; + int result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2720,7 +2726,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) unlock_task_sighand(task, &flags); } - return sprintf(buffer, + result = sprintf(buffer, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@ -2735,6 +2741,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f1637f17c37..9d99131d0d6 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -620,8 +620,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, if (!ent) goto out; memset(ent, 0, sizeof(struct proc_dir_entry)); - memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); - ent->name = ((char *) ent) + sizeof(*ent); + memcpy(ent->name, fn, len + 1); ent->namelen = len; ent->mode = mode; ent->nlink = nlink; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 74b48cfa1bb..7ed72d6c1c6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -319,7 +319,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); kfree(pdeo); - return -EINVAL; + return -ENOENT; } pde->pde_users++; open = pde->proc_fops->open; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index ed257d14156..586174168e2 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -10,7 +10,7 @@ #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 9020ac15baa..f738024ccc8 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -197,15 +197,15 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd), GFP_KERNEL); + netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); if (!netd) goto out; netd->data = net; netd->nlink = 2; - netd->name = "net"; netd->namelen = 3; netd->parent = &proc_root; + memcpy(netd->name, "net", 4); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); diff --git a/fs/proc/root.c b/fs/proc/root.c index d6c3b416529..9a8a2b77b87 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -186,13 +186,13 @@ static const struct inode_operations proc_root_inode_operations = { struct proc_dir_entry proc_root = { .low_ino = PROC_ROOT_INO, .namelen = 5, - .name = "/proc", .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .count = ATOMIC_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, + .name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) diff --git a/fs/read_write.c b/fs/read_write.c index 5907b49e4d7..179f1c33ea5 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -166,8 +166,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * long as offset isn't at the end of the file then the * offset is data. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } break; case SEEK_HOLE: /* @@ -175,8 +177,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * as long as offset isn't i_size or larger, return * i_size. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } offset = inode->i_size; break; } diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 483442e66ed..d1aca1df4f9 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -214,7 +214,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, } /* otherwise we clear all bit were set ... */ while (--i >= *beg) - reiserfs_test_and_clear_le_bit + reiserfs_clear_le_bit (i, bh->b_data); reiserfs_restore_prepared_buffer(s, bh); *beg = org; @@ -1222,15 +1222,11 @@ void reiserfs_cache_bitmap_metadata(struct super_block *sb, info->free_count = 0; while (--cur >= (unsigned long *)bh->b_data) { - int i; - /* 0 and ~0 are special, we can optimize for them */ if (*cur == 0) info->free_count += BITS_PER_LONG; else if (*cur != ~0L) /* A mix, investigate */ - for (i = BITS_PER_LONG - 1; i >= 0; i--) - if (!reiserfs_test_le_bit(i, cur)) - info->free_count++; + info->free_count += BITS_PER_LONG - hweight_long(*cur); } } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index c7156dc39ce..ace635053a3 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -319,5 +319,5 @@ const struct inode_operations reiserfs_file_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2922b90ceac..9b0d4b78b4f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1475,6 +1475,11 @@ void reiserfs_read_locked_inode(struct inode *inode, reiserfs_check_path(&path_to_sd); /* init inode should be relsing */ + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + cache_no_acl(inode); } /** diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index c5e82ece7c6..a159ba5a35e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -678,23 +678,19 @@ struct buffer_chunk { static void write_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_logged_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static void write_ordered_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_ordered_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, @@ -986,8 +982,6 @@ static int flush_commit_list(struct super_block *s, return 0; } - get_fs_excl(); - /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ @@ -1145,7 +1139,6 @@ static int flush_commit_list(struct super_block *s, if (retval) reiserfs_abort(s, retval, "Journal write error in %s", __func__); - put_fs_excl(); return retval; } @@ -1374,8 +1367,6 @@ static int flush_journal_list(struct super_block *s, return 0; } - get_fs_excl(); - /* if all the work is already done, get out of here */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -1597,7 +1588,6 @@ static int flush_journal_list(struct super_block *s, put_journal_list(s, jl); if (flushall) mutex_unlock(&journal->j_flush_mutex); - put_fs_excl(); return err; } @@ -3108,7 +3098,6 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, th->t_trans_id = journal->j_trans_id; unlock_journal(sb); INIT_LIST_HEAD(&th->t_list); - get_fs_excl(); return 0; out_fail: @@ -3964,7 +3953,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush = flags & FLUSH_ALL; wait_on_commit = flags & WAIT; - put_fs_excl(); current->journal_info = th->t_handle_save; reiserfs_check_lock_depth(sb, "journal end"); if (journal->j_len == 0) { @@ -4316,4 +4304,3 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) dump_stack(); #endif } - diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 551f1b79dbc..ef392324bbf 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1529,7 +1529,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; /* @@ -1546,7 +1546,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; @@ -1560,5 +1560,5 @@ const struct inode_operations reiserfs_special_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .check_acl = reiserfs_check_acl, + .get_acl = reiserfs_get_acl, }; diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index b3a94d20f0f..b6b9b1fe33b 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -136,7 +136,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) return -EIO; } memset(bh->b_data, 0, sb_blocksize(sb)); - reiserfs_test_and_set_le_bit(0, bh->b_data); + reiserfs_set_le_bit(0, bh->b_data); reiserfs_cache_bitmap_metadata(s, bh, bitmap + i); set_buffer_uptodate(bh); @@ -172,7 +172,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_prepare_for_journal(s, bh, 1); for (i = block_r; i < s->s_blocksize * 8; i++) - reiserfs_test_and_clear_le_bit(i, bh->b_data); + reiserfs_clear_le_bit(i, bh->b_data); info->free_count += s->s_blocksize * 8 - block_r; journal_mark_dirty(&th, s, bh); @@ -190,7 +190,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_prepare_for_journal(s, bh, 1); for (i = block_r_new; i < s->s_blocksize * 8; i++) - reiserfs_test_and_set_le_bit(i, bh->b_data); + reiserfs_set_le_bit(i, bh->b_data); journal_mark_dirty(&th, s, bh); brelse(bh); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6938d8c68d6..6bc346c160e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -867,33 +867,6 @@ out: return err; } -int reiserfs_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl; - int error = -EAGAIN; /* do regular unix permission checks by default */ - - /* - * Stat data v1 doesn't support ACLs. - */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) - return -EAGAIN; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - - if (acl) { - if (!IS_ERR(acl)) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } else if (PTR_ERR(acl) != -ENODATA) - error = PTR_ERR(acl); - } - - return error; -} - static int create_privroot(struct dentry *dentry) { int err; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 3dc38f1206f..7362cf4c946 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -354,9 +354,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, return PTR_ERR(acl); if (acl) { - struct posix_acl *acl_copy; mode_t mode = inode->i_mode; - int need_acl; /* Copy the default ACL to the default ACL of a new directory */ if (S_ISDIR(inode->i_mode)) { @@ -368,29 +366,15 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, /* Now we reconcile the new ACL and the mode, potentially modifying both */ - acl_copy = posix_acl_clone(acl, GFP_NOFS); - if (!acl_copy) { - err = -ENOMEM; - goto cleanup; - } + err = posix_acl_create(&acl, GFP_NOFS, &mode); + if (err < 0) + return err; - need_acl = posix_acl_create_masq(acl_copy, &mode); - if (need_acl >= 0) { - if (mode != inode->i_mode) { - inode->i_mode = mode; - } + inode->i_mode = mode; - /* If we need an ACL.. */ - if (need_acl > 0) { - err = reiserfs_set_acl(th, inode, - ACL_TYPE_ACCESS, - acl_copy); - if (err) - goto cleanup_copy; - } - } - cleanup_copy: - posix_acl_release(acl_copy); + /* If we need an ACL.. */ + if (err > 0) + err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); cleanup: posix_acl_release(acl); } else { @@ -445,7 +429,10 @@ int reiserfs_cache_default_acl(struct inode *inode) int reiserfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct reiserfs_transaction_handle th; + struct posix_acl *acl; + size_t size; + int depth; int error; if (S_ISLNK(inode->i_mode)) @@ -463,30 +450,22 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; if (IS_ERR(acl)) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_NOFS); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); + error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); + if (error) + return error; + + size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); + depth = reiserfs_write_lock_once(inode->i_sb); + error = journal_begin(&th, inode->i_sb, size * 2); if (!error) { - struct reiserfs_transaction_handle th; - size_t size = reiserfs_xattr_nblocks(inode, - reiserfs_acl_size(clone->a_count)); - int depth; - - depth = reiserfs_write_lock_once(inode->i_sb); - error = journal_begin(&th, inode->i_sb, size * 2); - if (!error) { - int error2; - error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, - clone); - error2 = journal_end(&th, inode->i_sb, size * 2); - if (error2) - error = error2; - } - reiserfs_write_unlock_once(inode->i_sb, depth); + int error2; + error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); + error2 = journal_end(&th, inode->i_sb, size * 2); + if (error2) + error = error2; } - posix_acl_release(clone); + reiserfs_write_unlock_once(inode->i_sb, depth); + posix_acl_release(acl); return error; } diff --git a/fs/splice.c b/fs/splice.c index aa866d30969..fa2defa8afc 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -132,7 +132,7 @@ error: return err; } -static const struct pipe_buf_operations page_cache_pipe_buf_ops = { +const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -264,7 +264,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } -static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { page_cache_release(spd->pages[i]); } diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 7797218d0b3..1360d4f88f4 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -1,7 +1,6 @@ config SQUASHFS tristate "SquashFS 4.0 - Squashed file system support" depends on BLOCK - select ZLIB_INFLATE help Saying Y here includes support for SquashFS 4.0 (a Compressed Read-Only File System). Squashfs is a highly compressed read-only @@ -36,6 +35,19 @@ config SQUASHFS_XATTR If unsure, say N. +config SQUASHFS_ZLIB + bool "Include support for ZLIB compressed file systems" + depends on SQUASHFS + select ZLIB_INFLATE + default y + help + ZLIB compression is the standard compression used by Squashfs + file systems. It offers a good trade-off between compression + achieved and the amount of CPU time and memory necessary to + compress and decompress. + + If unsure, say Y. + config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index cecf2bea07a..110b0476f3b 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o +squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 9f1b0bb96f1..3f6271d86ab 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -52,6 +52,12 @@ static const struct squashfs_decompressor squashfs_xz_comp_ops = { }; #endif +#ifndef CONFIG_SQUASHFS_ZLIB +static const struct squashfs_decompressor squashfs_zlib_comp_ops = { + NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 +}; +#endif + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, 0, "unknown", 0 }; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 8ba70cff09a..330073e2902 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -56,4 +56,8 @@ extern const struct squashfs_decompressor squashfs_xz_comp_ops; extern const struct squashfs_decompressor squashfs_lzo_comp_ops; #endif +#ifdef CONFIG_SQUASHFS_ZLIB +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; +#endif + #endif diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index e3be6a71cfa..d1266516ed0 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -97,6 +97,3 @@ extern const struct inode_operations squashfs_symlink_inode_ops; /* xattr.c */ extern const struct xattr_handler *squashfs_xattr_handlers[]; - -/* zlib_wrapper.c */ -extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/super.c b/fs/super.c index 7943f04cb3a..3f56a269a4f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -351,13 +351,11 @@ bool grab_super_passive(struct super_block *sb) */ void lock_super(struct super_block * sb) { - get_fs_excl(); mutex_lock(&sb->s_lock); } void unlock_super(struct super_block * sb) { - put_fs_excl(); mutex_unlock(&sb->s_lock); } @@ -385,7 +383,6 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(&sb->s_inodes); @@ -400,7 +397,6 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - put_fs_excl(); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index cac48fe22ad..44ce5165680 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -114,6 +114,8 @@ xfs_get_acl(struct inode *inode, int type) if (acl != ACL_NOT_CACHED) return acl; + trace_xfs_get_acl(ip); + switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -218,40 +220,6 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int -xfs_check_acl(struct inode *inode, int mask) -{ - struct xfs_inode *ip; - struct posix_acl *acl; - int error = -EAGAIN; - - ip = XFS_I(inode); - trace_xfs_check_acl(ip); - - /* - * If there is no attribute fork no ACL exists on this inode and - * we can skip the whole exercise. - */ - if (!XFS_IFORK_Q(ip)) - return -EAGAIN; - - if (mask & MAY_NOT_BLOCK) { - if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) - return -ECHILD; - return -EAGAIN; - } - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - } - - return error; -} - static int xfs_set_mode(struct inode *inode, mode_t mode) { @@ -297,29 +265,23 @@ posix_acl_default_exists(struct inode *inode) * No need for i_mutex because the inode is not yet exposed to the VFS. */ int -xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl) +xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) { - struct posix_acl *clone; - mode_t mode; + mode_t mode = inode->i_mode; int error = 0, inherit = 0; if (S_ISDIR(inode->i_mode)) { - error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl); + error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); if (error) - return error; + goto out; } - clone = posix_acl_clone(default_acl, GFP_KERNEL); - if (!clone) - return -ENOMEM; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); + error = posix_acl_create(&acl, GFP_KERNEL, &mode); if (error < 0) - goto out_release_clone; + return error; /* - * If posix_acl_create_masq returns a positive value we need to + * If posix_acl_create returns a positive value we need to * inherit a permission that can't be represented using the Unix * mode bits and we actually need to set an ACL. */ @@ -328,20 +290,20 @@ xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl) error = xfs_set_mode(inode, mode); if (error) - goto out_release_clone; + goto out; if (inherit) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - out_release_clone: - posix_acl_release(clone); +out: + posix_acl_release(acl); return error; } int xfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; + struct posix_acl *acl; int error; if (S_ISLNK(inode->i_mode)) @@ -351,16 +313,12 @@ xfs_acl_chmod(struct inode *inode) if (IS_ERR(acl) || !acl) return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (error) + return error; - posix_acl_release(clone); + error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); return error; } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index b2b41198559..d1fe74506c4 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1224,6 +1224,9 @@ _xfs_buf_ioapply( rw = READ; } + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index cca00f49e09..7f7b42469ea 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -149,7 +149,9 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); xfs_ioend_wait(ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); if (mp->m_flags & XFS_MOUNT_BARRIER) { /* @@ -881,11 +883,14 @@ xfs_file_aio_write( /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; + int error; xfs_rw_iunlock(ip, iolock); - ret = -xfs_file_fsync(file, pos, end, + error = xfs_file_fsync(file, pos, end, (file->f_flags & __O_SYNC) ? 0 : 1); xfs_rw_ilock(ip, iolock); + if (error) + ret = error; } out_unlock: diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index acca2c5ca3f..f7ce7debe14 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -265,7 +265,7 @@ xfs_open_by_handle( return PTR_ERR(filp); } - if (inode->i_mode & S_IFREG) { + if (S_ISREG(inode->i_mode)) { filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } @@ -850,14 +850,14 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & XFS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (xflags & XFS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; if (xflags & XFS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; if (xflags & XFS_XFLAG_EXTSIZE) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 501e4f63054..b9c172b3fbb 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -202,9 +202,9 @@ xfs_vn_mknod( if (default_acl) { error = -xfs_inherit_acl(inode, default_acl); + default_acl = NULL; if (unlikely(error)) goto out_cleanup_inode; - posix_acl_release(default_acl); } @@ -1022,7 +1022,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1048,7 +1048,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1073,7 +1073,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1086,7 +1086,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .check_acl = xfs_check_acl, + .get_acl = xfs_get_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1194,6 +1194,15 @@ xfs_setup_inode( break; } + /* + * If there is no attribute fork no ACL can exist on this inode, + * and it can't have any file capabilities attached to it either. + */ + if (!XFS_IFORK_Q(ip)) { + inode_has_no_xattr(inode); + cache_no_acl(inode); + } + xfs_iflags_clear(ip, XFS_INEW); barrier(); diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index fda0708ef2e..690fc7a7bd7 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -571,7 +571,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL -DEFINE_INODE_EVENT(xfs_check_acl); +DEFINE_INODE_EVENT(xfs_get_acl); #endif DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 0135e2a669d..39632d94135 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -42,7 +42,6 @@ struct xfs_acl { #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #ifdef CONFIG_XFS_POSIX_ACL -extern int xfs_check_acl(struct inode *inode, int mask); extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); extern int xfs_acl_chmod(struct inode *inode); @@ -52,8 +51,10 @@ extern int posix_acl_default_exists(struct inode *inode); extern const struct xattr_handler xfs_xattr_acl_access_handler; extern const struct xattr_handler xfs_xattr_acl_default_handler; #else -# define xfs_check_acl NULL -# define xfs_get_acl(inode, type) NULL +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +{ + return NULL; +} # define xfs_inherit_acl(inode, default_acl) 0 # define xfs_acl_chmod(inode) 0 # define posix_acl_access_exists(inode) 0 diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c51a3f90363..ab3e5c6c464 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -414,7 +414,7 @@ xfs_bmap_add_attrfork_local( if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; @@ -3344,8 +3344,7 @@ xfs_bmap_local_to_extents( * We don't want to deal with the case of keeping inode data inline yet. * So sending the data fork of a regular inode is invalid. */ - ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && - whichfork == XFS_DATA_FORK)); + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); flags = 0; @@ -4052,7 +4051,7 @@ xfs_bmap_one_block( #ifndef DEBUG if (whichfork == XFS_DATA_FORK) { - return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? + return S_ISREG(ip->i_d.di_mode) ? (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 2925726529f..5bfcb8779f9 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -692,6 +692,24 @@ xfs_da_join(xfs_da_state_t *state) return(error); } +#ifdef DEBUG +static void +xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) +{ + __be16 magic = blkinfo->magic; + + if (level == 1) { + ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + } else + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); +} +#else /* !DEBUG */ +#define xfs_da_blkinfo_onlychild_validate(blkinfo, level) +#endif /* !DEBUG */ + /* * We have only one entry in the root. Copy the only remaining child of * the old root to block 0 as the new root node. @@ -700,8 +718,6 @@ STATIC int xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) { xfs_da_intnode_t *oldroot; - /* REFERENCED */ - xfs_da_blkinfo_t *blkinfo; xfs_da_args_t *args; xfs_dablk_t child; xfs_dabuf_t *bp; @@ -732,15 +748,9 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) if (error) return(error); ASSERT(bp != NULL); - blkinfo = bp->data; - if (be16_to_cpu(oldroot->hdr.level) == 1) { - ASSERT(blkinfo->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - blkinfo->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - } else { - ASSERT(blkinfo->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - } - ASSERT(!blkinfo->forw); - ASSERT(!blkinfo->back); + xfs_da_blkinfo_onlychild_validate(bp->data, + be16_to_cpu(oldroot->hdr.level)); + memcpy(root_blk->bp->data, bp->data, state->blocksize); xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 4580ce00aeb..a2e27010c7f 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -121,7 +121,7 @@ xfs_dir_isempty( { xfs_dir2_sf_hdr_t *sfp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if (dp->i_d.di_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) @@ -179,7 +179,7 @@ xfs_dir_init( memset((char *)&args, 0, sizeof(args)); args.dp = dp; args.trans = tp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) return error; return xfs_dir2_sf_create(&args, pdp->i_ino); @@ -202,7 +202,7 @@ xfs_dir_createname( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; XFS_STATS_INC(xs_dir_create); @@ -278,7 +278,7 @@ xfs_dir_lookup( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -333,7 +333,7 @@ xfs_dir_removename( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -382,7 +382,7 @@ xfs_readdir( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) @@ -414,7 +414,7 @@ xfs_dir_replace( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; @@ -464,7 +464,7 @@ xfs_dir_canenter( if (resblks) return 0; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 084b3247d63..0179a41d9e5 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1564,7 +1564,7 @@ xfs_dir2_node_addname_int( if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { xfs_alert(mp, - "%s: dir ino " "%llu needed freesp block %lld for\n" + "%s: dir ino %llu needed freesp block %lld for\n" " data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, (long long)xfs_dir2_db_to_fdb(mp, dbno), diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9124425b7f2..3ff3d9e23de 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -344,9 +344,9 @@ _xfs_filestream_update_ag( * Either ip is a regular file and pip is a directory, or ip is a * directory and pip is NULL. */ - ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && - (pip->i_d.di_mode & S_IFDIR)) || - ((ip->i_d.di_mode & S_IFDIR) && !pip))); + ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && + S_ISDIR(pip->i_d.di_mode)) || + (S_ISDIR(ip->i_d.di_mode) && !pip))); mp = ip->i_mount; cache = mp->m_filestream; @@ -537,7 +537,7 @@ xfs_filestream_lookup_ag( xfs_agnumber_t ag; int ref; - if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { ASSERT(0); return NULLAGNUMBER; } @@ -579,9 +579,9 @@ xfs_filestream_associate( xfs_agnumber_t ag, rotorstep, startag; int err = 0; - ASSERT(pip->i_d.di_mode & S_IFDIR); - ASSERT(ip->i_d.di_mode & S_IFREG); - if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + ASSERT(S_ISDIR(pip->i_d.di_mode)); + ASSERT(S_ISREG(ip->i_d.di_mode)); + if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) return -EINVAL; mp = pip->i_mount; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cc21ddf9f7..2fcca4b03ed 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -368,7 +368,7 @@ xfs_iformat( /* * no local regular files yet */ - if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { xfs_warn(ip->i_mount, "corrupt inode %Lu (local format for regular file).", (unsigned long long) ip->i_ino); @@ -1040,7 +1040,7 @@ xfs_ialloc( if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; - if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { ip->i_d.di_mode |= S_ISGID; } } @@ -1097,14 +1097,14 @@ xfs_ialloc( if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; - if ((mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } - } else if ((mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { @@ -1188,7 +1188,7 @@ xfs_isize_check( int nimaps; xfs_bmbt_irec_t imaps[2]; - if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) + if (!S_ISREG(ip->i_d.di_mode)) return; if (XFS_IS_REALTIME_INODE(ip)) @@ -1828,7 +1828,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + (!S_ISREG(ip->i_d.di_mode))); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -2671,7 +2671,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip, ip->i_d.di_magic); goto corrupt_out; } - if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (S_ISREG(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), @@ -2681,7 +2681,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip); goto corrupt_out; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + } else if (S_ISDIR(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a97644ab945..2380a4bcbec 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -263,7 +263,7 @@ typedef struct xfs_inode { struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ +#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ (ip)->i_size : (ip)->i_d.di_size; /* Convert from vfs inode to xfs inode */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8fe4206de05..052a2c0ec5f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2283,7 +2283,7 @@ xlog_recover_inode_pass2( /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; - if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", @@ -2296,7 +2296,7 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7f25245da28..092e16ae4d9 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1331,7 +1331,7 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { + if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 77a59891734..df78c297d1a 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -116,7 +116,7 @@ xfs_rename( trace_xfs_rename(src_dp, target_dp, src_name, target_name); new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); + src_is_directory = S_ISDIR(src_ip->i_d.di_mode); if (src_is_directory) { /* @@ -226,7 +226,7 @@ xfs_rename( * target and source are directories and that target can be * destroyed, or that neither is a directory. */ - if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(target_ip->i_d.di_mode)) { /* * Make sure target dir is empty. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 88d121486c5..9322e13f0c6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -121,7 +121,7 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); + ASSERT(S_ISLNK(ip->i_d.di_mode)); ASSERT(ip->i_d.di_size <= MAXPATHLEN); pathlen = ip->i_d.di_size; @@ -529,7 +529,7 @@ xfs_release( if (ip->i_d.di_nlink == 0) return 0; - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && @@ -610,7 +610,7 @@ xfs_inactive( truncate = ((ip->i_d.di_nlink == 0) && ((ip->i_d.di_size != 0) || (ip->i_size != 0) || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && - ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); + S_ISREG(ip->i_d.di_mode)); mp = ip->i_mount; @@ -621,7 +621,7 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && @@ -669,7 +669,7 @@ xfs_inactive( xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { + } else if (S_ISLNK(ip->i_d.di_mode)) { /* * If we get an error while cleaning up a |