diff options
Diffstat (limited to 'fs')
172 files changed, 3512 insertions, 2651 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 20c106f2492..1b0b1955001 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -584,11 +584,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, success: d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", + _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", fid.vnode, fid.unique, dentry->d_inode->i_ino, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); return NULL; } @@ -671,10 +671,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) * been deleted and replaced, and the original vnode ID has * been reused */ if (fid.unique != vnode->fid.unique) { - _debug("%s: file deleted (uq %u -> %u I:%llu)", + _debug("%s: file deleted (uq %u -> %u I:%u)", dentry->d_name.name, fid.unique, vnode->fid.unique, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); spin_lock(&vnode->lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); spin_unlock(&vnode->lock); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4bd0218473a..346e3289abd 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -89,7 +89,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, i_size_write(&vnode->vfs_inode, size); vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_version = vnode->fid.unique; + vnode->vfs_inode.i_generation = vnode->fid.unique; vnode->vfs_inode.i_nlink = status->nlink; mode = vnode->vfs_inode.i_mode; @@ -102,6 +102,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; + vnode->vfs_inode.i_version = data_version; } expected_version = status->data_version; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index db66c520147..0fdab6e03d8 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c 
@@ -75,7 +75,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_version = vnode->fid.unique; + inode->i_generation = vnode->fid.unique; + inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; /* check to see whether a symbolic link is really a mountpoint */ @@ -100,7 +101,7 @@ static int afs_iget5_test(struct inode *inode, void *opaque) struct afs_iget_data *data = opaque; return inode->i_ino == data->fid.vnode && - inode->i_version == data->fid.unique; + inode->i_generation == data->fid.unique; } /* @@ -122,7 +123,7 @@ static int afs_iget5_set(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); inode->i_ino = data->fid.vnode; - inode->i_version = data->fid.unique; + inode->i_generation = data->fid.unique; vnode->fid = data->fid; vnode->volume = data->volume; @@ -380,8 +381,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, inode = dentry->d_inode; - _enter("{ ino=%lu v=%llu }", inode->i_ino, - (unsigned long long)inode->i_version); + _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); generic_fillattr(inode, stat); return 0; diff --git a/fs/afs/super.c b/fs/afs/super.c index fb240e8766d..356dcf0929e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -31,8 +31,8 @@ static void afs_i_init_once(void *foo); static struct dentry *afs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); -static void afs_put_super(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -40,7 +40,7 @@ struct file_system_type afs_fs_type = { .owner = THIS_MODULE, .name = "afs", .mount = afs_mount, - .kill_sb = kill_anon_super, + 
.kill_sb = afs_kill_super, .fs_flags = 0, }; @@ -50,7 +50,6 @@ static const struct super_operations afs_super_ops = { .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .evict_inode = afs_evict_inode, - .put_super = afs_put_super, .show_options = generic_show_options, }; @@ -282,19 +281,25 @@ static int afs_parse_device_name(struct afs_mount_params *params, */ static int afs_test_super(struct super_block *sb, void *data) { - struct afs_mount_params *params = data; + struct afs_super_info *as1 = data; struct afs_super_info *as = sb->s_fs_info; - return as->volume == params->volume; + return as->volume == as1->volume; +} + +static int afs_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, void *data) +static int afs_fill_super(struct super_block *sb, + struct afs_mount_params *params) { - struct afs_mount_params *params = data; - struct afs_super_info *as = NULL; + struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; struct dentry *root = NULL; struct inode *inode = NULL; @@ -302,23 +307,13 @@ static int afs_fill_super(struct super_block *sb, void *data) _enter(""); - /* allocate a superblock info record */ - as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); - if (!as) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - afs_get_volume(params->volume); - as->volume = params->volume; - /* fill in the superblock */ sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_fs_info = as; sb->s_bdi = &as->volume->bdi; + strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); /* allocate the root inode and dentry */ fid.vid = as->volume->vid; @@ -326,7 +321,7 @@ static int afs_fill_super(struct super_block *sb, void *data) fid.unique = 1; inode = afs_iget(sb, params->key, &fid, NULL, NULL); if (IS_ERR(inode)) 
- goto error_inode; + return PTR_ERR(inode); if (params->autocell) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); @@ -342,16 +337,8 @@ static int afs_fill_super(struct super_block *sb, void *data) _leave(" = 0"); return 0; -error_inode: - ret = PTR_ERR(inode); - inode = NULL; error: iput(inode); - afs_put_volume(as->volume); - kfree(as); - - sb->s_fs_info = NULL; - _leave(" = %d", ret); return ret; } @@ -367,6 +354,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, struct afs_volume *vol; struct key *key; char *new_opts = kstrdup(options, GFP_KERNEL); + struct afs_super_info *as; int ret; _enter(",,%s,%p", dev_name, options); @@ -399,12 +387,22 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, ret = PTR_ERR(vol); goto error; } - params.volume = vol; + + /* allocate a superblock info record */ + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (!as) { + ret = -ENOMEM; + afs_put_volume(vol); + goto error; + } + as->volume = vol; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, set_anon_super, &params); + sb = sget(fs_type, afs_test_super, afs_set_super, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); + afs_put_volume(vol); + kfree(as); goto error; } @@ -422,16 +420,16 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, MS_ACTIVE); + afs_put_volume(vol); + kfree(as); } - afs_put_volume(params.volume); afs_put_cell(params.cell); kfree(new_opts); _leave(" = 0 [%p]", sb); return dget(sb->s_root); error: - afs_put_volume(params.volume); afs_put_cell(params.cell); key_put(params.key); kfree(new_opts); @@ -439,18 +437,12 @@ error: return ERR_PTR(ret); } -/* - * finish the unmounting process on the superblock - */ -static void afs_put_super(struct super_block *sb) +static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = sb->s_fs_info; - - _enter(""); - + kill_anon_super(sb); afs_put_volume(as->volume); - - 
_leave(""); + kfree(as); } /* diff --git a/fs/afs/write.c b/fs/afs/write.c index 789b3afb342..b806285ff85 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,23 +84,21 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, unsigned len, struct page *page) + loff_t pos, struct page *page) { loff_t i_size; - unsigned eof; int ret; + int len; - _enter(",,%llu,%u", (unsigned long long)pos, len); - - ASSERTCMP(len, <=, PAGE_CACHE_SIZE); + _enter(",,%llu", (unsigned long long)pos); i_size = i_size_read(&vnode->vfs_inode); - if (pos + len > i_size) - eof = i_size; + if (pos + PAGE_CACHE_SIZE > i_size) + len = i_size - pos; else - eof = PAGE_CACHE_SIZE; + len = PAGE_CACHE_SIZE; - ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); + ret = afs_vnode_fetch_data(vnode, key, pos, len, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -153,9 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; /* page won't leak in error case: it eventually gets cleaned off LRU */ - if (!PageUptodate(page)) { - _debug("not up to date"); - ret = afs_fill_page(vnode, key, pos, len, page); + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9ad2369d9e3..bfcb18feb1d 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -231,9 +231,6 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return -EIO; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 63039ed9576..2bc5dc644b4 100644 --- a/fs/binfmt_elf_fdpic.c +++ 
b/fs/binfmt_elf_fdpic.c @@ -1864,6 +1864,7 @@ cleanup: kfree(psinfo); kfree(notes); kfree(fpu); + kfree(shdr4extnum); #ifdef ELF_CORE_COPY_XFPREGS kfree(xfpu); #endif diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a2421f908f..610e8e0b04b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -762,7 +762,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, if (!disk) return ERR_PTR(-ENXIO); - whole = bdget_disk(disk, 0); + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. + */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + module_put(disk->fops->owner); put_disk(disk); if (!whole) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d84089349c8..2e667868e0d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1228,6 +1228,7 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; + bool map = true; if (level != 1) return; @@ -1249,8 +1250,11 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; + if (node->map_token || path->skip_locking) + map = false; + while (1) { - if (!node->map_token) { + if (map && !node->map_token) { unsigned long offset = btrfs_node_key_ptr_offset(nr); map_private_extent_buffer(node, offset, sizeof(struct btrfs_key_ptr), @@ -1277,7 +1281,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; @@ -1289,7 +1293,7 @@ static void 
reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 378b5b4443f..3b859a3e6a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -19,7 +19,6 @@ #ifndef __BTRFS_CTREE__ #define __BTRFS_CTREE__ -#include <linux/version.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/fs.h> @@ -967,6 +966,12 @@ struct btrfs_fs_info { struct srcu_struct subvol_srcu; spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -1172,6 +1177,14 @@ struct btrfs_root { u32 type; u64 highest_objectid; + + /* btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So in_trans_setup + * is used to tell us when more checks are required + */ + unsigned long in_trans_setup; int ref_cows; int track_dirty; int in_radix; @@ -1181,7 +1194,6 @@ struct btrfs_root { struct btrfs_key defrag_max; int defrag_running; char *name; - int in_sysfs; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; @@ -1323,6 +1335,11 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_STRING_ITEM_KEY 253 +/* + * Flags for mount options. 
+ * + * Note: don't forget to add new options to btrfs_show_options() + */ #define BTRFS_MOUNT_NODATASUM (1 << 0) #define BTRFS_MOUNT_NODATACOW (1 << 1) #define BTRFS_MOUNT_NOBARRIER (1 << 2) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6462c29d2d3..98c68e658a9 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -82,19 +82,16 @@ static inline struct btrfs_delayed_root *btrfs_get_delayed_root( return root->fs_info->delayed_root; } -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct inode *inode) +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) { - struct btrfs_delayed_node *node; struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(inode); - int ret; + struct btrfs_delayed_node *node; -again: node = ACCESS_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); /* can be accessed */ + atomic_inc(&node->refs); return node; } @@ -102,8 +99,10 @@ again: node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); - goto again; + return node; } btrfs_inode->delayed_node = node; atomic_inc(&node->refs); /* can be accessed */ @@ -113,6 +112,23 @@ again: } spin_unlock(&root->inode_lock); + return NULL; +} + +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + +again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); @@ -297,7 +313,6 @@ struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) 
item->data_len = data_len; item->ins_or_del = 0; item->bytes_reserved = 0; - item->block_rsv = NULL; item->delayed_node = NULL; atomic_set(&item->refs, 1); } @@ -549,19 +564,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item( return next; } -static inline struct btrfs_delayed_node *btrfs_get_delayed_node( - struct inode *inode) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - struct btrfs_delayed_node *delayed_node; - - delayed_node = btrfs_inode->delayed_node; - if (delayed_node) - atomic_inc(&delayed_node->refs); - - return delayed_node; -} - static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, u64 root_id) { @@ -593,10 +595,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { + if (!ret) item->bytes_reserved = num_bytes; - item->block_rsv = dst_rsv; - } return ret; } @@ -604,10 +604,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { + struct btrfs_block_rsv *rsv; + if (!item->bytes_reserved) return; - btrfs_block_rsv_release(root, item->block_rsv, + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -1014,6 +1017,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret = 0; path = btrfs_alloc_path(); @@ -1021,6 +1025,9 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); 
@@ -1045,6 +1052,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, } btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1052,6 +1060,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; path = btrfs_alloc_path(); @@ -1059,6 +1068,9 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, node->root, node); @@ -1066,6 +1078,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_update_delayed_inode(trans, node->root, path, node); btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1116,6 +1129,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1134,6 +1148,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) if (IS_ERR(trans)) goto free_path; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, root, @@ -1176,6 +1193,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) nr = trans->blocks_used; + trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); __btrfs_btree_balance_dirty(root, nr); free_path: @@ -1222,6 +1240,13 @@ again: return 0; } +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct 
btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; @@ -1382,8 +1407,7 @@ end: int btrfs_inode_delayed_dir_index_count(struct inode *inode) { - struct btrfs_delayed_node *delayed_node = BTRFS_I(inode)->delayed_node; - int ret = 0; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1393,11 +1417,14 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) * a new directory index is added into the delayed node and index_cnt * is updated now. So we needn't lock the delayed node. */ - if (!delayed_node->index_cnt) + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); return -EINVAL; + } BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; - return ret; + btrfs_release_delayed_node(delayed_node); + return 0; } void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, @@ -1591,6 +1618,57 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode->i_ctime.tv_nsec); } +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->inode_dirty) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + inode->i_uid = btrfs_stack_inode_uid(inode_item); + inode->i_gid = btrfs_stack_inode_gid(inode_item); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + inode_set_bytes(inode, 
btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index eb7d240aa64..8d27af4bd8b 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -75,7 +75,6 @@ struct btrfs_delayed_item { struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; - struct btrfs_block_rsv *block_rsv; struct btrfs_delayed_node *delayed_node; atomic_t refs; int ins_or_del; @@ -120,6 +119,7 @@ void btrfs_kill_delayed_inode_items(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); @@ -138,4 +138,8 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, 
/* for init */ int __init btrfs_delayed_inode_init(void); void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a203d363184..1ac8db5dc0a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,7 +1044,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_trans = 0; root->highest_objectid = 0; root->name = NULL; - root->in_sysfs = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; @@ -1300,19 +1299,21 @@ again: return root; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - if (!root->free_ino_ctl) - goto fail; root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), GFP_NOFS); - if (!root->free_ino_pinned) + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; goto fail; + } btrfs_init_free_ino_ctl(root); mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); - set_anon_super(&root->anon_super, NULL); + ret = set_anon_super(&root->anon_super, NULL); + if (ret) + goto fail; if (btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; @@ -1618,6 +1619,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1668,8 +1670,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, &fs_info->generic_worker); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ 
-2911,9 +2911,8 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) INIT_LIST_HEAD(&splice); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); - spin_lock(&root->fs_info->delalloc_lock); + list_splice_init(&root->fs_info->delalloc_inodes, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b9b6b6df24..71cd456fdb6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3089,6 +3089,13 @@ alloc: } goto again; } + + /* + * If we have less pinned bytes than we want to allocate then + * don't bother committing the transaction, it won't help us. + */ + if (data_sinfo->bytes_pinned < bytes) + committed = 1; spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ @@ -3307,10 +3314,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; - /* nothing to shrink - nothing to reclaim */ - if (root->fs_info->delalloc_bytes == 0) - return 0; - max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { @@ -4839,7 +4842,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, u64 num_bytes, u64 empty_size, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - int data) + u64 data) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -4866,7 +4869,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, space_info = __find_space_info(root->fs_info, data); if (!space_info) { - printk(KERN_ERR "No space info for %d\n", data); + printk(KERN_ERR "No space info for %llu\n", data); return -ENOSPC; } @@ -5211,9 +5214,7 @@ loop: * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && - (found_uncached_bg || empty_size || empty_cluster || - allowed_chunk_alloc)) { + if (!ins->objectid && loop < 
LOOP_NO_EMPTY_SIZE) { index = 0; if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; @@ -5253,32 +5254,36 @@ loop: goto search; } - if (loop < LOOP_CACHING_WAIT) { - loop++; - goto search; - } + loop++; if (loop == LOOP_ALLOC_CHUNK) { - empty_size = 0; - empty_cluster = 0; - } + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, + CHUNK_ALLOC_LIMITED); + allowed_chunk_alloc = 0; + if (ret == 1) + done_chunk_alloc = 1; + } else if (!done_chunk_alloc && + space_info->force_alloc == + CHUNK_ALLOC_NO_FORCE) { + space_info->force_alloc = CHUNK_ALLOC_LIMITED; + } - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - allowed_chunk_alloc = 0; - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; + /* + * We didn't allocate a chunk, go ahead and drop the + * empty size and loop again. 
+ */ + if (!done_chunk_alloc) + loop = LOOP_NO_EMPTY_SIZE; } - if (loop < LOOP_NO_EMPTY_SIZE) { - loop++; - goto search; + if (loop == LOOP_NO_EMPTY_SIZE) { + empty_size = 0; + empty_cluster = 0; } - ret = -ENOSPC; + + goto search; } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4e8445a4757..a11a92ee2d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -126,9 +126,9 @@ struct extent_buffer { unsigned long map_len; struct page *first_page; unsigned long bflags; - atomic_t refs; struct list_head leak_list; struct rcu_head rcu_head; + atomic_t refs; /* the spinlock is used to protect most operations */ spinlock_t lock; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ad144736a5f..bf0d61567f3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -250,7 +250,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, pgoff_t index = 0; unsigned long first_page_offset; int num_checksums; - int ret = 0, ret2; + int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -421,11 +421,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } spin_lock(&ctl->tree_lock); - ret2 = link_free_space(ctl, e); + ret = link_free_space(ctl, e); ctl->total_bitmaps++; ctl->op->recalc_thresholds(ctl); spin_unlock(&ctl->tree_lock); - list_add_tail(&e->list, &bitmaps); if (ret) { printk(KERN_ERR "Duplicate entries in " "free space cache, dumping\n"); @@ -434,6 +433,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, page_cache_release(page); goto free_cache; } + list_add_tail(&e->list, &bitmaps); } num_entries--; @@ -1417,6 +1417,23 @@ again: return 0; } +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + u64 bytes_to_set = 0; + u64 end; + + end = info->offset + (u64)(BITS_PER_BITMAP * 
ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { @@ -1453,12 +1470,18 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } +static struct btrfs_free_space_op free_space_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; + struct btrfs_block_group_cache *block_group = NULL; int added = 0; - u64 bytes, offset, end; + u64 bytes, offset, bytes_added; int ret; bytes = info->bytes; @@ -1467,7 +1490,49 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, if (!ctl->op->use_bitmap(ctl, info)) return 0; + if (ctl->op == &free_space_op) + block_group = ctl->private; again: + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. 
+ */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto no_cluster_bitmap; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, + offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } + +no_cluster_bitmap: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { @@ -1475,19 +1540,10 @@ again: goto new_bitmap; } - end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); - - if (offset >= bitmap_info->offset && offset + bytes > end) { - bitmap_set_bits(ctl, bitmap_info, offset, end - offset); - bytes -= end - offset; - offset = end; - added = 0; - } else if (offset >= bitmap_info->offset && offset + bytes <= end) { - bitmap_set_bits(ctl, bitmap_info, offset, bytes); - bytes = 0; - } else { - BUG(); - } + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + added = 0; if (!bytes) { ret = 1; @@ -1766,11 +1822,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, "\n", count); } -static struct btrfs_free_space_op free_space_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -1842,9 +1893,12 @@ void 
__btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); - unlink_free_space(ctl, info); - kfree(info->bitmap); - kmem_cache_free(btrfs_free_space_cachep, info); + if (!info->bitmap) { + unlink_free_space(ctl, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } if (need_resched()) { spin_unlock(&ctl->tree_lock); cond_resched(); @@ -2142,9 +2196,11 @@ again: /* * This searches the block group for just extents to fill the cluster with. */ -static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; @@ -2166,6 +2222,8 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * extent entry. */ while (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) return -ENOSPC; @@ -2185,8 +2243,12 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); - if (entry->bitmap) + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); continue; + } + /* * we haven't filled the empty size and the window is * very large. 
reset and try again @@ -2238,9 +2300,11 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * This specifically looks for bitmaps that may work in the cluster, we assume * that we have already failed to find extents that will work. */ -static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2250,10 +2314,39 @@ static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (ctl->total_bitmaps == 0) return -ENOSPC; + /* + * First check our cached list of bitmaps and see if there is an entry + * here that will work. + */ + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * If we do have entries on our list and we are here then we didn't find + * anything, so go ahead and get the next entry after the last entry in + * this list and start the search from there. 
+ */ + if (!list_empty(bitmaps)) { + entry = list_entry(bitmaps->prev, struct btrfs_free_space, + list); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + goto search; + } + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); if (!entry) return -ENOSPC; +search: node = &entry->offset_index; do { entry = rb_entry(node, struct btrfs_free_space, offset_index); @@ -2284,6 +2377,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct list_head bitmaps; + struct btrfs_free_space *entry, *tmp; u64 min_bytes; int ret; @@ -2322,11 +2417,16 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes, - min_bytes); + INIT_LIST_HEAD(&bitmaps); + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes, min_bytes); if (ret) - ret = setup_cluster_bitmap(block_group, cluster, offset, - bytes, min_bytes); + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); if (!ret) { atomic_inc(&block_group->count); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ebf95f7a44d..3601f0aebdd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1986,7 +1986,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - return 0; + goto good; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { @@ -2509,6 +2509,11 @@ static void btrfs_read_locked_inode(struct inode *inode) int maybe_acls; u32 rdev; int ret; + bool filled = false; + + ret = 
btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; path = btrfs_alloc_path(); BUG_ON(!path); @@ -2520,6 +2525,10 @@ static void btrfs_read_locked_inode(struct inode *inode) goto make_bad; leaf = path->nodes[0]; + + if (filled) + goto cache_acl; + inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); if (!leaf->map_token) @@ -2556,7 +2565,7 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - +cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls @@ -2572,7 +2581,6 @@ static void btrfs_read_locked_inode(struct inode *inode) } btrfs_free_path(path); - inode_item = NULL; switch (inode->i_mode & S_IFMT) { case S_IFREG: @@ -2670,12 +2678,14 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, int ret; /* - * If root is tree root, it means this inode is used to - * store free space information. And these inodes are updated - * when committing the transaction, so they needn't delaye to - * be updated, or deadlock will occured. + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. 
+ * + * The data relocation inode should also be directly updated + * without delay */ - if (!is_free_space_inode(root, inode)) { + if (!is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); @@ -3076,6 +3086,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); + btrfs_free_path(path); return 0; } @@ -3646,7 +3657,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_i_size_write(inode, 0); while (1) { - trans = btrfs_start_transaction(root, 0); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = root->orphan_block_rsv; @@ -4519,6 +4530,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_tree_add(inode); trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, inode); return inode; fail: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac37040e426..a3c4751e07d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -482,8 +482,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); + spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); if (async_transid) { *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, @@ -2054,29 +2056,34 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { - struct btrfs_ioctl_fs_info_args fi_args; + struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - 
fi_args.num_devices = fs_devices->num_devices; - fi_args.max_id = 0; - memcpy(&fi_args.fsid, root->fs_info->fsid, sizeof(fi_args.fsid)); + fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); + if (!fi_args) + return -ENOMEM; + + fi_args->num_devices = fs_devices->num_devices; + memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->devid > fi_args.max_id) - fi_args.max_id = device->devid; + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; } mutex_unlock(&fs_devices->device_list_mutex); - if (copy_to_user(arg, &fi_args, sizeof(fi_args))) - return -EFAULT; + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; - return 0; + kfree(fi_args); + return ret; } static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b1ef27cc673..5e0a3dc79a4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1368,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return 0; + goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -1390,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, root->fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); + +out: return 0; } @@ -2142,10 +2144,11 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - spin_lock(&root->fs_info->trans_lock); + mutex_lock(&root->fs_info->reloc_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2214,9 +2217,16 @@ int merge_reloc_roots(struct 
reloc_control *rc) int ret; again: root = rc->extent_root; - spin_lock(&root->fs_info->trans_lock); + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; @@ -3590,17 +3600,19 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = rc; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = NULL; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static int check_extent_flags(u64 flags) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index df50fd1eca8..a8d03d5efb5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,13 +16,7 @@ * Boston, MA 021110-1307, USA. 
*/ -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> #include <linux/blkdev.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include <linux/workqueue.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -804,18 +798,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid != logical) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - goto out; - } + goto out_noplug; + /* + * we might miss half an extent here, but that doesn't matter, + * as it's only the prefetch + */ while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -824,7 +812,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (ret == 0) continue; if (ret < 0) - goto out; + goto out_noplug; break; } @@ -906,15 +894,20 @@ again: ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid != logical) { + if (ret > 0) { ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); if (ret < 0) goto out; + if (ret > 0) { + /* there's no smaller item, so stick with the + * larger one */ + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } } while (1) { @@ -989,6 +982,7 @@ next: out: blk_finish_plug(&plug); +out_noplug: btrfs_free_path(path); return ret < 0 ? 
ret : 0; } @@ -1064,8 +1058,15 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - ret = 0; + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + } l = path->nodes[0]; slot = path->slots[0]; @@ -1075,7 +1076,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) if (found_key.objectid != sdev->dev->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) break; if (found_key.offset >= end) @@ -1104,7 +1105,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) cache = btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) { ret = -ENOENT; - goto out; + break; } ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, chunk_offset, length); @@ -1116,9 +1117,13 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) btrfs_release_path(path); } -out: btrfs_free_path(path); - return ret; + + /* + * ret can still be 1 from search_slot or next_leaf, + * that's not an error + */ + return ret < 0 ? 
ret : 0; } static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) @@ -1155,8 +1160,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; mutex_lock(&fs_info->scrub_lock); - if (fs_info->scrub_workers_refcnt == 0) + if (fs_info->scrub_workers_refcnt == 0) { + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, &fs_info->generic_worker); + fs_info->scrub_workers.idle_thresh = 4; btrfs_start_workers(&fs_info->scrub_workers, 1); + } ++fs_info->scrub_workers_refcnt; mutex_unlock(&fs_info->scrub_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 117e74e3604..15634d4648d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -723,6 +723,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(root, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(root, INODE_MAP_CACHE)) + seq_puts(seq, ",inode_cache"); return 0; } @@ -825,7 +831,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 
1 : 0); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c3c223ae669..daac9ae6d73 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -28,152 +28,6 @@ #include "disk-io.h" #include "transaction.h" -static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_used(&root->root_item)); -} - -static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_limit(&root->root_item)); -} - -static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) -{ - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); -} - -static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); -} - -static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); -} - -/* this is for root attrs (subvols/snapshots) */ -struct btrfs_root_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_root *, char *); - ssize_t (*store)(struct btrfs_root *, const char *, size_t); -}; - -#define ROOT_ATTR(name, mode, show, store) \ -static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ - show, store) - -ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); -ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); - -static struct attribute *btrfs_root_attrs[] = { - &btrfs_root_attr_blocks_used.attr, - &btrfs_root_attr_block_limit.attr, - NULL, -}; - -/* this is for super attrs (actual full fs) */ -struct btrfs_super_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_fs_info *, char *); - ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); -}; - 
-#define SUPER_ATTR(name, mode, show, store) \ -static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ - show, store) - -SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); -SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); -SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); - -static struct attribute *btrfs_super_attrs[] = { - &btrfs_super_attr_blocks_used.attr, - &btrfs_super_attr_total_blocks.attr, - &btrfs_super_attr_blocksize.attr, - NULL, -}; - -static ssize_t btrfs_super_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->show ? a->show(fs, buf) : 0; -} - -static ssize_t btrfs_super_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->store ? a->store(fs, buf, len) : 0; -} - -static ssize_t btrfs_root_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - - return a->show ? a->show(root, buf) : 0; -} - -static ssize_t btrfs_root_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - return a->store ? 
a->store(root, buf, len) : 0; -} - -static void btrfs_super_release(struct kobject *kobj) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - complete(&fs->kobj_unregister); -} - -static void btrfs_root_release(struct kobject *kobj) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - complete(&root->kobj_unregister); -} - -static const struct sysfs_ops btrfs_super_attr_ops = { - .show = btrfs_super_attr_show, - .store = btrfs_super_attr_store, -}; - -static const struct sysfs_ops btrfs_root_attr_ops = { - .show = btrfs_root_attr_show, - .store = btrfs_root_attr_store, -}; - /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dd719662340..51dcec86757 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, +static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + /* + * see below for in_trans_setup usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + root->in_trans_setup = 1; + + /* make sure readers find in_trans_setup before + * they find our root->last_trans update + */ + smp_wmb(); + spin_lock(&root->fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } - root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); 
spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root->in_trans_setup. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ btrfs_init_reloc_root(trans, root); + smp_wmb(); + root->in_trans_setup = 0; } return 0; } + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + /* + * see record_root_in_trans for comments about in_trans_setup usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !root->in_trans_setup) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. 
@@ -349,7 +406,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) list) { if (t->in_commit) { if (t->commit_done) - goto out; + break; cur_trans = t; atomic_inc(&cur_trans->use_count); break; @@ -882,7 +939,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - btrfs_record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -900,7 +957,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - btrfs_record_root_in_trans(trans, root); + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -961,14 +1027,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, int ret; list_for_each_entry(pending, head, list) { - /* - * We must deal with the delayed items before creating - * snapshots, or we will create a snapthot with inconsistent - * information. 
- */ - ret = btrfs_run_delayed_items(trans, fs_info->fs_root); - BUG_ON(ret); - ret = create_pending_snapshot(trans, fs_info, pending); BUG_ON(ret); } @@ -1118,8 +1176,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); - put_transaction(cur_trans); + if (current->journal_info == trans) + current->journal_info = NULL; + + put_transaction(cur_trans); return 0; } @@ -1238,21 +1299,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(&cur_trans->writer_wait, &wait); - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; - spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); - ret = create_pending_snapshots(trans, root->fs_info); - BUG_ON(ret); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * no_join so make sure to wait for num_writers to == 1 again. 
+ */ + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); ret = btrfs_run_delayed_items(trans, root); BUG_ON(ret); + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + WARN_ON(cur_trans != trans->transaction); btrfs_scrub_pause(root); @@ -1309,6 +1391,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->running_transaction = NULL; root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); wake_up(&root->fs_info->transaction_wait); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 592396c6dc4..4ce8a9f41d1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3177,7 +3177,7 @@ again: tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(!wc.replay_dest); + BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da541dfca2e..19450bc5363 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -689,12 +689,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, transid = btrfs_super_generation(disk_super); if (disk_super->label[0]) printk(KERN_INFO "device label %s ", disk_super->label); - else { - /* FIXME, make a readl uuid parser */ - printk(KERN_INFO "device fsid %llx-%llx ", - *(unsigned long long 
*)disk_super->fsid, - *(unsigned long long *)(disk_super->fsid + 8)); - } + else + printk(KERN_INFO "device fsid %pU ", disk_super->fsid); printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); @@ -2102,7 +2098,8 @@ int btrfs_balance(struct btrfs_root *dev_root) chunk_root->root_key.objectid, found_key.objectid, found_key.offset); - BUG_ON(ret && ret != -ENOSPC); + if (ret && ret != -ENOSPC) + goto error; key.offset = found_key.offset - 1; } ret = 0; diff --git a/fs/buffer.c b/fs/buffer.c index 49c9aada037..1a80b048ade 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1902,10 +1902,8 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (unlikely(err)) { + if (unlikely(err)) page_zero_new_buffers(page, from, to); - ClearPageUptodate(page); - } return err; } EXPORT_SYMBOL(__block_write_begin); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 33da49dc3cc..5a3953db811 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -453,7 +453,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) int err; struct inode *inode = page->mapping->host; BUG_ON(!inode); - igrab(inode); + ihold(inode); err = writepage_nounlock(page, wbc); unlock_page(page); iput(inode); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1f72b00447c..f605753c8fe 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2940,14 +2940,12 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) while (!list_empty(&mdsc->cap_dirty)) { ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = igrab(&ci->vfs_inode); + inode = &ci->vfs_inode; + ihold(inode); dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - if (inode) { - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, - NULL); - iput(inode); - } + ceph_check_caps(ci, 
CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 33729e822bb..ef8f08c343e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -308,7 +308,8 @@ more: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_dentry = dget(filp->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; @@ -787,10 +788,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); - if (err) + if (err) { d_drop(dentry); - else if (!req->r_reply_info.head->is_dentry) - d_instantiate(dentry, igrab(old_dentry->d_inode)); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + } ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index a610d3d6748..f67b687550d 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -109,7 +109,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(-ESTALE); @@ -167,7 +167,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(err ? 
err : -ESTALE); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 203252d88d9..4698a5c553d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -191,7 +191,8 @@ int ceph_open(struct inode *inode, struct file *file) err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, parent_inode, req); if (!err) @@ -282,14 +283,13 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool align_to_pages, + int *checkeof, bool o_direct, unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int io_align, page_align; - int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; struct page **page_pos; @@ -307,7 +307,7 @@ static int striped_read(struct inode *inode, io_align = off & ~PAGE_MASK; more: - if (align_to_pages) + if (o_direct) page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; @@ -317,20 +317,19 @@ more: ci->i_truncate_seq, ci->i_truncate_size, page_pos, pages_left, page_align); - hit_stripe = this_len < left; - was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) ret = 0; + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? 
" SHORT" : ""); if (ret > 0) { - int didpages = - ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; + int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_off + read, + ceph_zero_page_vector_range(page_align + read, pos - off - read, pages); } pos += ret; @@ -345,20 +344,22 @@ more: } if (was_short) { - /* was original extent fully inside i_size? */ - if (pos + left <= inode->i_size) { - dout("zero tail\n"); - ceph_zero_page_vector_range(page_off + read, len - read, + /* did we bounce off eof? */ + if (pos + left > inode->i_size) + *checkeof = 1; + + /* zero trailing bytes (inside i_size) */ + if (left > 0 && pos < inode->i_size) { + if (pos + left > inode->i_size) + left = inode->i_size - pos; + + dout("zero tail %d\n", left); + ceph_zero_page_vector_range(page_align + read, left, pages); - read = len; - goto out; + read += left; } - - /* check i_size */ - *checkeof = 1; } -out: if (ret >= 0) ret = read; dout("striped_read returns %d\n", ret); @@ -475,9 +476,6 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -501,6 +499,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, * boundary. this isn't atomic, unfortunately. :( */ more: + io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; len = left; if (file->f_flags & O_DIRECT) { /* write from beginning of first page, regardless of @@ -590,6 +590,7 @@ out: pos += len; written += len; left -= len; + data += written; if (left) goto more; @@ -658,7 +659,7 @@ out: /* hit EOF or hole? 
*/ if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, reading more\n"); + dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); read += ret; base += ret; len -= ret; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 70b6a4839c3..d8858e96ab1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1101,10 +1101,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { - igrab(in); + ihold(in); } else { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, in, ceph_ino(in), ceph_snap(in), @@ -1144,7 +1144,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ } @@ -1328,7 +1328,7 @@ void ceph_queue_writeback(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); } @@ -1353,7 +1353,7 @@ void ceph_queue_invalidate(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); } @@ -1477,7 +1477,7 @@ void ceph_queue_vmtruncate(struct inode *inode) if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); @@ -1738,7 +1738,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) __mark_inode_dirty(inode, 
inode_dirty_flags); if (mask) { - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; @@ -1779,7 +1780,8 @@ int ceph_do_getattr(struct inode *inode, int mask) req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8888c9ba68d..ef0b5f48e13 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -73,7 +73,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -135,7 +136,8 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 476b329867d..80576d05d68 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -23,7 +23,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) @@ -32,11 +33,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, length = fl->fl_end - fl->fl_start + 1; dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - 
"length: %llu, wait: %d, type`: %d", (int)lock_type, + "length: %llu, wait: %d, type: %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type); - req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); @@ -70,7 +70,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type, + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type, err); return err; @@ -109,16 +109,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { - /* undo! This should only happen if the kernel detects - * local deadlock. */ + /* undo! This should only happen if + * the kernel detects local + * deadlock. 
*/ ceph_lock_message(CEPH_LOCK_FCNTL, op, file, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on posix_lock_file, undid lock", err); + dout("got %d on posix_lock_file, undid lock", + err); } } - } else { - dout("mds returned error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); } return err; } @@ -155,8 +159,11 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } - } else { - dout("mds error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); } return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 79743d146be..0c1d9175652 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1438,12 +1438,15 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, struct dentry *temp; char *path; int len, pos; + unsigned seq; if (dentry == NULL) return ERR_PTR(-EINVAL); retry: len = 0; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp);) { struct inode *inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) @@ -1455,10 +1458,12 @@ retry: len += 1 + temp->d_name.len; temp = temp->d_parent; if (temp == NULL) { + rcu_read_unlock(); pr_err("build_path corrupt dentry %p\n", dentry); return ERR_PTR(-EINVAL); } } + rcu_read_unlock(); if (len) len--; /* no leading '/' */ @@ -1467,9 +1472,12 @@ retry: return ERR_PTR(-ENOMEM); pos = len; path[pos] = 0; /* trailing null */ + rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { - struct inode *inode = temp->d_inode; + struct inode *inode; + spin_lock(&temp->d_lock); + inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", pos, temp); @@ 
-1478,21 +1486,26 @@ retry: break; } else { pos -= temp->d_name.len; - if (pos < 0) + if (pos < 0) { + spin_unlock(&temp->d_lock); break; + } strncpy(path + pos, temp->d_name.name, temp->d_name.len); } + spin_unlock(&temp->d_lock); if (pos) path[--pos] = '/'; temp = temp->d_parent; if (temp == NULL) { + rcu_read_unlock(); pr_err("build_path corrupt dentry\n"); kfree(path); return ERR_PTR(-EINVAL); } } - if (pos != 0) { + rcu_read_unlock(); + if (pos != 0 || read_seqretry(&rename_lock, seq)) { pr_err("build_path did not end path lookup where " "expected, namelen is %d, pos is %d\n", len, pos); /* presumably this is only possible if racing with a diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 24067d68a55..54b14de2e72 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -722,7 +722,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); inode = &ci->vfs_inode; - igrab(inode); + ihold(inode); spin_unlock(&mdsc->snap_flush_lock); spin_lock(&inode->i_lock); __ceph_flush_snaps(ci, &session, 0); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f2b62869618..f42d730f1b6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -665,7 +665,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_args.setxattr.flags = cpu_to_le32(flags); @@ -795,7 +796,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 1cd4c3a1862..f66cc162515 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -7,6 +7,7 
@@ config CIFS select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ARC4 + select CRYPTO_ECB select CRYPTO_DES help This is the client VFS module for the Common Internet File System @@ -148,13 +149,13 @@ config CIFS_FSCACHE config CIFS_ACL bool "Provide CIFS ACL support (EXPERIMENTAL)" - depends on EXPERIMENTAL && CIFS_XATTR + depends on EXPERIMENTAL && CIFS_XATTR && KEYS help Allows to fetch CIFS/NTFS ACL from the server. The DACL blob is handed over to the application/caller. config CIFS_NFSD_EXPORT bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL + depends on CIFS && EXPERIMENTAL && BROKEN help Allows NFS server to export a CIFS mounted share (nfsd over cifs) diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index dd8584d35a1..545509c3313 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, break; default: - cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family); + cERROR(1, "Unknown network family '%d'", sa->sa_family); key_len = 0; break; } @@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { - cFYI(1, "CIFS: couldn't extract sharename\n"); + cFYI(1, "%s: couldn't extract sharename\n", __func__); sharename = NULL; return 0; } @@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) pagevec_init(&pvec, 0); first = 0; - cFYI(1, "cifs inode 0x%p now uncached", cifsi); + cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi); for (;;) { nr_pages = pagevec_lookup(&pvec, diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index ffb1459dc6e..7260e11e21f 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -42,6 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 
0x80000 /* use pid forwarding for rw */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index dfbd9f1f373..5a0ee7f2af0 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -184,7 +184,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, if (cifs_pdu == NULL || server == NULL) return -EINVAL; - if (cifs_pdu->Command == SMB_COM_NEGOTIATE) + if (!server->session_estab) return 0; if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 989442dcfb4..bc4b12ca537 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -35,6 +35,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/namei.h> #include <net/ipv6.h> #include "cifsfs.h" #include "cifspdu.h" @@ -104,8 +105,7 @@ cifs_sb_deactive(struct super_block *sb) } static int -cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, - const char *devname, int silent) +cifs_read_super(struct super_block *sb) { struct inode *inode; struct cifs_sb_info *cifs_sb; @@ -113,22 +113,16 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, cifs_sb = CIFS_SB(sb); - spin_lock_init(&cifs_sb->tlink_tree_lock); - cifs_sb->tlink_tree = RB_ROOT; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) + sb->s_flags |= MS_POSIXACL; - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); - if (rc) - return rc; - - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; - - rc = cifs_mount(sb, cifs_sb, volume_info, devname); + if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; - if (rc) { - if (!silent) - cERROR(1, "cifs_mount failed w/return code = %d", rc); - goto out_mount_failed; - } + /* BB FIXME fix time_gran to be larger for LANMAN sessions */ + 
sb->s_time_gran = 100; sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; @@ -170,37 +164,14 @@ out_no_root: if (inode) iput(inode); - cifs_umount(sb, cifs_sb); - -out_mount_failed: - bdi_destroy(&cifs_sb->bdi); return rc; } -static void -cifs_put_super(struct super_block *sb) +static void cifs_kill_sb(struct super_block *sb) { - int rc = 0; - struct cifs_sb_info *cifs_sb; - - cFYI(1, "In cifs_put_super"); - cifs_sb = CIFS_SB(sb); - if (cifs_sb == NULL) { - cFYI(1, "Empty cifs superblock info passed to unmount"); - return; - } - - rc = cifs_umount(sb, cifs_sb); - if (rc) - cERROR(1, "cifs_umount failed with return code %d", rc); - if (cifs_sb->mountdata) { - kfree(cifs_sb->mountdata); - cifs_sb->mountdata = NULL; - } - - unload_nls(cifs_sb->local_nls); - bdi_destroy(&cifs_sb->bdi); - kfree(cifs_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + kill_anon_super(sb); + cifs_umount(cifs_sb); } static int @@ -257,9 +228,6 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { @@ -352,6 +320,37 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) } } +static void +cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) +{ + seq_printf(s, ",sec="); + + switch (server->secType) { + case LANMAN: + seq_printf(s, "lanman"); + break; + case NTLMv2: + seq_printf(s, "ntlmv2"); + break; + case NTLM: + seq_printf(s, "ntlm"); + break; + case Kerberos: + seq_printf(s, "krb5"); + break; + case RawNTLMSSP: + seq_printf(s, "ntlmssp"); + break; + default: + /* shouldn't ever happen */ + seq_printf(s, "unknown"); + break; + } + + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + seq_printf(s, "i"); +} + /* * cifs_show_options() is for displaying mount options in /proc/mounts. 
* Not all settable options are displayed but most of the important @@ -365,6 +364,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; + cifs_show_security(s, tcon->ses->server); + seq_printf(s, ",unc=%s", tcon->treeName); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) @@ -518,7 +519,6 @@ static int cifs_drop_inode(struct inode *inode) } static const struct super_operations cifs_super_ops = { - .put_super = cifs_put_super, .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, .destroy_inode = cifs_destroy_inode, @@ -543,91 +543,55 @@ static const struct super_operations cifs_super_ops = { static struct dentry * cifs_get_root(struct smb_vol *vol, struct super_block *sb) { - int xid, rc; - struct inode *inode; - struct qstr name; - struct dentry *dparent = NULL, *dchild = NULL, *alias; + struct dentry *dentry; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - unsigned int i, full_len, len; - char *full_path = NULL, *pstart; + char *full_path = NULL; + char *s, *p; char sep; + int xid; full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) - return NULL; + return ERR_PTR(-ENOMEM); cFYI(1, "Get root dentry for %s", full_path); xid = GetXid(); sep = CIFS_DIR_SEP(cifs_sb); - dparent = dget(sb->s_root); - full_len = strlen(full_path); - full_path[full_len] = sep; - pstart = full_path + 1; - - for (i = 1, len = 0; i <= full_len; i++) { - if (full_path[i] != sep || !len) { - len++; - continue; - } - - full_path[i] = 0; - cFYI(1, "get dentry for %s", pstart); - - name.name = pstart; - name.len = len; - name.hash = full_name_hash(pstart, len); - dchild = d_lookup(dparent, &name); - if (dchild == NULL) { - cFYI(1, "not exists"); - dchild = d_alloc(dparent, &name); - if (dchild == NULL) { - dput(dparent); - dparent = NULL; - goto out; - } - } - - cFYI(1, "get inode"); - if (dchild->d_inode == NULL) { - cFYI(1, "not exists"); - inode = 
NULL; - if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) - rc = cifs_get_inode_info_unix(&inode, full_path, - sb, xid); - else - rc = cifs_get_inode_info(&inode, full_path, - NULL, sb, xid, NULL); - if (rc) { - dput(dchild); - dput(dparent); - dparent = NULL; - goto out; - } - alias = d_materialise_unique(dchild, inode); - if (alias != NULL) { - dput(dchild); - if (IS_ERR(alias)) { - dput(dparent); - dparent = NULL; - goto out; - } - dchild = alias; - } - } - cFYI(1, "parent %p, child %p", dparent, dchild); - - dput(dparent); - dparent = dchild; - len = 0; - pstart = full_path + i + 1; - full_path[i] = sep; - } -out: + dentry = dget(sb->s_root); + p = s = full_path; + + do { + struct inode *dir = dentry->d_inode; + struct dentry *child; + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + mutex_lock(&dir->i_mutex); + child = lookup_one_len(p, dentry, s - p); + mutex_unlock(&dir->i_mutex); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); _FreeXid(xid); kfree(full_path); - return dparent; + return dentry; +} + +static int cifs_set_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = data; + sb->s_fs_info = mnt_data->cifs_sb; + return set_anon_super(sb, NULL); } static struct dentry * @@ -643,82 +607,80 @@ cifs_do_mount(struct file_system_type *fs_type, cFYI(1, "Devname: %s flags: %d ", dev_name, flags); - rc = cifs_setup_volume_info(&volume_info, (char *)data, dev_name); - if (rc) - return ERR_PTR(rc); + volume_info = cifs_get_volume_info((char *)data, dev_name); + if (IS_ERR(volume_info)) + return ERR_CAST(volume_info); cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); if (cifs_sb == NULL) { root = ERR_PTR(-ENOMEM); - goto out; + goto out_nls; + } + + cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; } cifs_setup_cifs_sb(volume_info, 
cifs_sb); + rc = cifs_mount(cifs_sb, volume_info); + if (rc) { + if (!(flags & MS_SILENT)) + cERROR(1, "cifs_mount failed w/return code = %d", rc); + root = ERR_PTR(rc); + goto out_mountdata; + } + mnt_data.vol = volume_info; mnt_data.cifs_sb = cifs_sb; mnt_data.flags = flags; - sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data); + sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data); if (IS_ERR(sb)) { root = ERR_CAST(sb); - goto out_cifs_sb; + cifs_umount(cifs_sb); + goto out; } - if (sb->s_fs_info) { + if (sb->s_root) { cFYI(1, "Use existing superblock"); - goto out_shared; - } - - /* - * Copy mount params for use in submounts. Better to do - * the copy here and deal with the error before cleanup gets - * complicated post-mount. - */ - cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); - if (cifs_sb->mountdata == NULL) { - root = ERR_PTR(-ENOMEM); - goto out_super; - } - - sb->s_flags = flags; - /* BB should we make this contingent on mount parm? */ - sb->s_flags |= MS_NODIRATIME | MS_NOATIME; - sb->s_fs_info = cifs_sb; + cifs_umount(cifs_sb); + } else { + sb->s_flags = flags; + /* BB should we make this contingent on mount parm? */ + sb->s_flags |= MS_NODIRATIME | MS_NOATIME; + + rc = cifs_read_super(sb); + if (rc) { + root = ERR_PTR(rc); + goto out_super; + } - rc = cifs_read_super(sb, volume_info, dev_name, - flags & MS_SILENT ? 
1 : 0); - if (rc) { - root = ERR_PTR(rc); - goto out_super; + sb->s_flags |= MS_ACTIVE; } - sb->s_flags |= MS_ACTIVE; - root = cifs_get_root(volume_info, sb); - if (root == NULL) + if (IS_ERR(root)) goto out_super; cFYI(1, "dentry root is: %p", root); goto out; -out_shared: - root = cifs_get_root(volume_info, sb); - if (root) - cFYI(1, "dentry root is: %p", root); - goto out; - out_super: - kfree(cifs_sb->mountdata); deactivate_locked_super(sb); +out: + cifs_cleanup_volume_info(volume_info); + return root; +out_mountdata: + kfree(cifs_sb->mountdata); out_cifs_sb: - unload_nls(cifs_sb->local_nls); kfree(cifs_sb); - -out: - cifs_cleanup_volume_info(&volume_info); - return root; +out_nls: + unload_nls(volume_info->local_nls); + goto out; } static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, @@ -807,7 +769,7 @@ struct file_system_type cifs_fs_type = { .owner = THIS_MODULE, .name = "cifs", .mount = cifs_do_mount, - .kill_sb = kill_anon_super, + .kill_sb = cifs_kill_sb, /* .fs_flags */ }; const struct inode_operations cifs_dir_inode_ops = { diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 64313f778eb..036ca83e5f4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -129,5 +129,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.72" +#define CIFS_VERSION "1.74" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 953f84413c7..8df28e925e5 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -154,12 +154,11 @@ extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); -extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info); -extern int cifs_setup_volume_info(struct smb_vol **pvolume_info, 
- char *mount_data, const char *devname); -extern int cifs_mount(struct super_block *, struct cifs_sb_info *, - struct smb_vol *, const char *); -extern int cifs_umount(struct super_block *, struct cifs_sb_info *); +extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); +extern struct smb_vol *cifs_get_volume_info(char *mount_data, + const char *devname); +extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); +extern void cifs_umount(struct cifs_sb_info *); extern void cifs_dfs_release_automount_timer(void); void cifs_proc_init(void); void cifs_proc_clean(void); @@ -218,7 +217,8 @@ extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo, struct dfs_info3_param **preferrals, int remap); extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, - struct super_block *sb, struct smb_vol *vol); + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol); extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData); extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6d88b82537c..ccc1afa0bf3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -65,6 +65,8 @@ static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); +static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname); /* * cifs tcp session reconnection @@ -152,7 +154,7 @@ cifs_reconnect(struct TCP_Server_Info *server) mid_entry->callback(mid_entry); } - while (server->tcpStatus == CifsNeedReconnect) { + do { try_to_freeze(); /* we should try only the port we connected to before */ @@ -167,7 +169,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); } - } + } while (server->tcpStatus 
== CifsNeedReconnect); return rc; } @@ -784,7 +786,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, struct smb_vol *vol) { char *value, *data, *end; - char *mountdata_copy, *options; + char *mountdata_copy = NULL, *options; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -1391,7 +1393,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, "/proc/fs/cifs/LookupCacheEnabled to 0\n"); } else if (strnicmp(data, "fsc", 3) == 0) { #ifndef CONFIG_CIFS_FSCACHE - cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " "kernel config option set"); goto cifs_parse_mount_err; #endif @@ -1976,7 +1978,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) warned_on_ntlm = true; cERROR(1, "default security mechanism requested. The default " "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 2.6.41"); + "ntlmv2 in kernel release 3.1"); } ses->overrideSecFlg = volume_info->secFlg; @@ -2149,7 +2151,10 @@ cifs_put_tlink(struct tcon_link *tlink) } static inline struct tcon_link * -cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb); +cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) +{ + return cifs_sb->master_tlink; +} static int compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) @@ -2237,8 +2242,8 @@ cifs_match_super(struct super_block *sb, void *data) rc = compare_mount_options(sb, mnt_data); out: - cifs_put_tlink(tlink); spin_unlock(&cifs_tcp_ses_lock); + cifs_put_tlink(tlink); return rc; } @@ -2471,14 +2476,6 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) return rc; - rc = socket->ops->connect(socket, saddr, slen, 0); - if (rc < 0) { - cFYI(1, "Error %d connecting to server", rc); - sock_release(socket); - server->ssocket = NULL; - return rc; - } - /* * Eventually check for other socket options to change from * the default. 
sock_setsockopt not used because it expects @@ -2507,6 +2504,14 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cFYI(1, "Error %d connecting to server", rc); + sock_release(socket); + server->ssocket = NULL; + return rc; + } + if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); @@ -2543,7 +2548,7 @@ ip_connect(struct TCP_Server_Info *server) } void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, - struct super_block *sb, struct smb_vol *vol_info) + struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info) { /* if we are reconnecting then should we check to see if * any requested capabilities changed locally e.g. via @@ -2597,22 +2602,23 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, cap &= ~CIFS_UNIX_POSIX_ACL_CAP; else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { cFYI(1, "negotiated posix acl support"); - if (sb) - sb->s_flags |= MS_POSIXACL; + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIXACL; } if (vol_info && vol_info->posix_paths == 0) cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { cFYI(1, "negotiate posix pathnames"); - if (sb) - CIFS_SB(sb)->mnt_cifs_flags |= + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; } - if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { + if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - CIFS_SB(sb)->rsize = 127 * 1024; + cifs_sb->rsize = 127 * 1024; cFYI(DBG2, "larger reads not supported by srv"); } } @@ -2659,6 +2665,9 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, { INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + spin_lock_init(&cifs_sb->tlink_tree_lock); + cifs_sb->tlink_tree = RB_ROOT; + if (pvolume_info->rsize > CIFSMaxBufSize) { cERROR(1, "rsize %d too large, using MaxBufSize", pvolume_info->rsize); @@ -2747,21 
+2756,21 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, /* * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24 - PAGE_CACHE_SIZE. + * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including + * the RFC1001 length. * * Note that this might make for "interesting" allocation problems during - * writeback however (as we have to allocate an array of pointers for the - * pages). A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * writeback however as we have to allocate an array of pointers for the + * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. */ -#define CIFS_MAX_WSIZE ((1<<24) - PAGE_CACHE_SIZE) +#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) /* - * When the server doesn't allow large posix writes, default to a wsize of - * 128k - PAGE_CACHE_SIZE -- one page less than the largest frame size - * described in RFC1001. This allows space for the header without going over - * that by default. + * When the server doesn't allow large posix writes, only allow a wsize of + * 128k minus the size of the WRITE_AND_X header. That allows for a write up + * to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1001_WSIZE (128 * 1024 - PAGE_CACHE_SIZE) +#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 @@ -2780,11 +2789,18 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) - wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1001_WSIZE); + wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); - /* no CAP_LARGE_WRITE_X? 
Limit it to 16 bits */ - if (!(server->capabilities & CAP_LARGE_WRITE_X)) - wsize = min_t(unsigned int, wsize, USHRT_MAX); + /* + * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? + * Limit it to max buffer offered by the server, minus the size of the + * WRITEX header, not including the 4 byte RFC1001 length. + */ + if (!(server->capabilities & CAP_LARGE_WRITE_X) || + (!(server->capabilities & CAP_UNIX) && + (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); /* hard limit of CIFS_MAX_WSIZE */ wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); @@ -2816,15 +2832,9 @@ is_path_accessible(int xid, struct cifs_tcon *tcon, return rc; } -void -cifs_cleanup_volume_info(struct smb_vol **pvolume_info) +static void +cleanup_volume_info_contents(struct smb_vol *volume_info) { - struct smb_vol *volume_info; - - if (!pvolume_info || !*pvolume_info) - return; - - volume_info = *pvolume_info; kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); @@ -2832,28 +2842,44 @@ cifs_cleanup_volume_info(struct smb_vol **pvolume_info) kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); +} + +void +cifs_cleanup_volume_info(struct smb_vol *volume_info) +{ + if (!volume_info) + return; + cleanup_volume_info_contents(volume_info); kfree(volume_info); - *pvolume_info = NULL; - return; } + #ifdef CONFIG_CIFS_DFS_UPCALL /* build_path_to_root returns full path to root when * we do not have an exiting connection (tcon) */ static char * -build_unc_path_to_root(const struct smb_vol *volume_info, +build_unc_path_to_root(const struct smb_vol *vol, const struct cifs_sb_info *cifs_sb) { - char *full_path; + char *full_path, *pos; + unsigned int pplen = vol->prepath ? 
strlen(vol->prepath) : 0; + unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); - int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1); - full_path = kmalloc(unc_len + 1, GFP_KERNEL); + full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); if (full_path == NULL) return ERR_PTR(-ENOMEM); - strncpy(full_path, volume_info->UNC, unc_len); - full_path[unc_len] = 0; /* add trailing null */ + strncpy(full_path, vol->UNC, unc_len); + pos = full_path + unc_len; + + if (pplen) { + strncpy(pos, vol->prepath, pplen); + pos += pplen; + } + + *pos = '\0'; /* add trailing null */ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + cFYI(1, "%s: full_path=%s", __func__, full_path); return full_path; } @@ -2896,15 +2922,18 @@ expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, &fake_devname); free_dfs_info_array(referrals, num_referrals); - kfree(fake_devname); - - if (cifs_sb->mountdata != NULL) - kfree(cifs_sb->mountdata); if (IS_ERR(mdata)) { rc = PTR_ERR(mdata); mdata = NULL; + } else { + cleanup_volume_info_contents(volume_info); + memset(volume_info, '\0', sizeof(*volume_info)); + rc = cifs_setup_volume_info(volume_info, mdata, + fake_devname); } + kfree(fake_devname); + kfree(cifs_sb->mountdata); cifs_sb->mountdata = mdata; } kfree(full_path); @@ -2912,29 +2941,20 @@ expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, } #endif -int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, - const char *devname) +static int +cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname) { - struct smb_vol *volume_info; int rc = 0; - *pvolume_info = NULL; - - volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); - if (!volume_info) { - rc = -ENOMEM; - goto out; - } - - if (cifs_parse_mount_options(mount_data, devname, - volume_info)) { - rc = -EINVAL; - goto out; - } + if (cifs_parse_mount_options(mount_data, devname, volume_info)) + return -EINVAL; if (volume_info->nullauth) { cFYI(1, "null user"); - 
volume_info->username = ""; + volume_info->username = kzalloc(1, GFP_KERNEL); + if (volume_info->username == NULL) + return -ENOMEM; } else if (volume_info->username) { /* BB fixme parse for domain name here */ cFYI(1, "Username: %s", volume_info->username); @@ -2942,8 +2962,7 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, cifserror("No username specified"); /* In userspace mount helper we can get user name from alternate locations such as env variables and files on disk */ - rc = -EINVAL; - goto out; + return -EINVAL; } /* this is needed for ASCII cp to Unicode converts */ @@ -2955,21 +2974,34 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, if (volume_info->local_nls == NULL) { cERROR(1, "CIFS mount error: iocharset %s not found", volume_info->iocharset); - rc = -ELIBACC; - goto out; + return -ELIBACC; } } - *pvolume_info = volume_info; - return rc; -out: - cifs_cleanup_volume_info(&volume_info); return rc; } +struct smb_vol * +cifs_get_volume_info(char *mount_data, const char *devname) +{ + int rc; + struct smb_vol *volume_info; + + volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); + if (!volume_info) + return ERR_PTR(-ENOMEM); + + rc = cifs_setup_volume_info(volume_info, mount_data, devname); + if (rc) { + cifs_cleanup_volume_info(volume_info); + volume_info = ERR_PTR(rc); + } + + return volume_info; +} + int -cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, - struct smb_vol *volume_info, const char *devname) +cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { int rc = 0; int xid; @@ -2980,6 +3012,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct tcon_link *tlink; #ifdef CONFIG_CIFS_DFS_UPCALL int referral_walks_count = 0; +#endif + + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + if (rc) + return rc; + + cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; + +#ifdef CONFIG_CIFS_DFS_UPCALL 
try_mount_again: /* cleanup activities if we're chasing a referral */ if (referral_walks_count) { @@ -2988,7 +3029,6 @@ try_mount_again: else if (pSesInfo) cifs_put_smb_ses(pSesInfo); - cifs_cleanup_volume_info(&volume_info); FreeXid(xid); } #endif @@ -3004,6 +3044,7 @@ try_mount_again: srvTcp = cifs_get_tcp_session(volume_info); if (IS_ERR(srvTcp)) { rc = PTR_ERR(srvTcp); + bdi_destroy(&cifs_sb->bdi); goto out; } @@ -3015,14 +3056,6 @@ try_mount_again: goto mount_fail_check; } - if (pSesInfo->capabilities & CAP_LARGE_FILES) - sb->s_maxbytes = MAX_LFS_FILESIZE; - else - sb->s_maxbytes = MAX_NON_LFS; - - /* BB FIXME fix time_gran to be larger for LANMAN sessions */ - sb->s_time_gran = 100; - /* search for existing tcon to this server share */ tcon = cifs_get_tcon(pSesInfo, volume_info); if (IS_ERR(tcon)) { @@ -3035,7 +3068,7 @@ try_mount_again: if (tcon->ses->capabilities & CAP_UNIX) { /* reset of caps checks mount to see if unix extensions disabled for just this mount */ - reset_cifs_unix_caps(xid, tcon, sb, volume_info); + reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info); if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && (le64_to_cpu(tcon->fsUnixInfo.Capability) & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { @@ -3158,6 +3191,7 @@ mount_fail_check: cifs_put_smb_ses(pSesInfo); else cifs_put_tcp_session(srvTcp); + bdi_destroy(&cifs_sb->bdi); goto out; } @@ -3171,6 +3205,10 @@ out: return rc; } +/* + * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon + * pointer may be NULL. 
+ */ int CIFSTCon(unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, @@ -3205,7 +3243,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if ((ses->server->sec_mode) & SECMODE_USER) { + if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ *bcc_ptr = 0; /* password is null byte */ bcc_ptr++; /* skip password */ @@ -3328,8 +3366,8 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, return rc; } -int -cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) +void +cifs_umount(struct cifs_sb_info *cifs_sb) { struct rb_root *root = &cifs_sb->tlink_tree; struct rb_node *node; @@ -3350,7 +3388,10 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); - return 0; + bdi_destroy(&cifs_sb->bdi); + kfree(cifs_sb->mountdata); + unload_nls(cifs_sb->local_nls); + kfree(cifs_sb); } int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) @@ -3371,7 +3412,7 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) } if (rc == 0) { spin_lock(&GlobalMid_Lock); - if (server->tcpStatus != CifsExiting) + if (server->tcpStatus == CifsNeedNegotiate) server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; @@ -3444,7 +3485,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) goto out; } - snprintf(username, MAX_USERNAME_SIZE, "krb50x%x", fsuid); + snprintf(username, sizeof(username), "krb50x%x", fsuid); vol_info->username = username; vol_info->local_nls = cifs_sb->local_nls; vol_info->linux_uid = fsuid; @@ -3484,12 +3525,6 @@ out: return tcon; } -static inline struct tcon_link * -cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) -{ - return cifs_sb->master_tlink; -} - struct cifs_tcon * cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 
81914df47ef..fa8c21d913b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -55,6 +55,7 @@ build_path_from_dentry(struct dentry *direntry) char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + unsigned seq; if (direntry == NULL) return NULL; /* not much we can do if dentry is freed and @@ -68,22 +69,29 @@ build_path_from_dentry(struct dentry *direntry) dfsplen = 0; cifs_bp_rename_retry: namelen = dfsplen; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); for (temp = direntry; !IS_ROOT(temp);) { namelen += (1 + temp->d_name.len); temp = temp->d_parent; if (temp == NULL) { cERROR(1, "corrupt dentry"); + rcu_read_unlock(); return NULL; } } + rcu_read_unlock(); full_path = kmalloc(namelen+1, GFP_KERNEL); if (full_path == NULL) return full_path; full_path[namelen] = 0; /* trailing null */ + rcu_read_lock(); for (temp = direntry; !IS_ROOT(temp);) { + spin_lock(&temp->d_lock); namelen -= 1 + temp->d_name.len; if (namelen < 0) { + spin_unlock(&temp->d_lock); break; } else { full_path[namelen] = dirsep; @@ -91,14 +99,17 @@ cifs_bp_rename_retry: temp->d_name.len); cFYI(0, "name: %s", full_path + namelen); } + spin_unlock(&temp->d_lock); temp = temp->d_parent; if (temp == NULL) { cERROR(1, "corrupt dentry"); + rcu_read_unlock(); kfree(full_path); return NULL; } } - if (namelen != dfsplen) { + rcu_read_unlock(); + if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { cERROR(1, "did not end path lookup where expected namelen is %d", namelen); /* presumably this is only possible if racing with a rename diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bb71471a4d9..a9b4a24f2a1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1737,7 +1737,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, io_parms.pid = pid; io_parms.tcon = pTcon; io_parms.offset = *poffset; - io_parms.length = len; + io_parms.length = cur_len; rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &read_data, 
&buf_type); pSMBr = (struct smb_com_read_rsp *)read_data; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index d368a47ba5e..42e5363b410 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, &cifs_fscache_server_index_def, server); - cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server, - server->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); } void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) { - cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server, - server->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); fscache_relinquish_cookie(server->fscache, 0); server->fscache = NULL; } @@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, &cifs_fscache_super_index_def, tcon); - cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)", - server->fscache, tcon->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache, + tcon->fscache); } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache); + cFYI(1, "%s: (0x%p)", __func__, tcon->fscache); fscache_relinquish_cookie(tcon->fscache, 0); tcon->fscache = NULL; } @@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, - cifsi->fscache); + cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__, + tcon->fscache, cifsi->fscache); } } @@ -80,8 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "CIFS 
releasing inode cookie (0x%p)", - cifsi->fscache); + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, 0); cifsi->fscache = NULL; } @@ -92,8 +91,8 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "CIFS disabling inode cookie (0x%p)", - cifsi->fscache); + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + fscache_uncache_all_inode_pages(cifsi->fscache, inode); fscache_relinquish_cookie(cifsi->fscache, 1); cifsi->fscache = NULL; } @@ -121,8 +120,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", - cifsi->fscache, old); + cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p", + __func__, cifsi->fscache, old); } } @@ -132,8 +131,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) struct inode *inode = page->mapping->host; struct cifsInodeInfo *cifsi = CIFS_I(inode); - cFYI(1, "CIFS: fscache release page (0x%p/0x%p)", - page, cifsi->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, + cifsi->fscache); if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) return 0; } @@ -144,8 +143,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, int error) { - cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)", - page, error); + cFYI(1, "%s: (0x%p/%d)", __func__, page, error); if (!error) SetPageUptodate(page); unlock_page(page); @@ -158,7 +156,7 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p", + cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, cifs_readpage_from_fscache_complete, @@ -167,11 +165,11 @@ int 
__cifs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case 0: /* page found in fscache, read submitted */ - cFYI(1, "CIFS: readpage_from_fscache: submitted"); + cFYI(1, "%s: submitted", __func__); return ret; case -ENOBUFS: /* page won't be cached */ case -ENODATA: /* page not in cache */ - cFYI(1, "CIFS: readpage_from_fscache %d", ret); + cFYI(1, "%s: %d", __func__, ret); return 1; default: @@ -190,7 +188,7 @@ int __cifs_readpages_from_fscache(struct inode *inode, { int ret; - cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)", + cFYI(1, "%s: (0x%p/%u/0x%p)", __func__, CIFS_I(inode)->fscache, *nr_pages, inode); ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, pages, nr_pages, @@ -199,12 +197,12 @@ int __cifs_readpages_from_fscache(struct inode *inode, mapping_gfp_mask(mapping)); switch (ret) { case 0: /* read submitted to the cache for all pages */ - cFYI(1, "CIFS: readpages_from_fscache: submitted"); + cFYI(1, "%s: submitted", __func__); return ret; case -ENOBUFS: /* some pages are not cached and can't be */ case -ENODATA: /* some pages are not cached */ - cFYI(1, "CIFS: readpages_from_fscache: no page"); + cFYI(1, "%s: no page", __func__); return 1; default: @@ -218,7 +216,7 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p", + cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); if (ret != 0) @@ -230,7 +228,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); struct fscache_cookie *cookie = cifsi->fscache; - cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie); + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie); fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } diff --git a/fs/cifs/sess.c 
b/fs/cifs/sess.c index 3892ab817a3..d3e619692ee 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -428,8 +428,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; if (!ses->server->session_estab) - flags |= NTLMSSP_NEGOTIATE_KEY_XCH | - NTLMSSP_NEGOTIATE_EXTENDED_SEC; + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } sec_blob->NegotiateFlags = cpu_to_le32(flags); @@ -465,10 +464,11 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED) - flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + } tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 1525d5e662b..1c5b770c314 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -90,12 +90,10 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) sg_init_one(&sgout, out, 8); rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); - if (rc) { + if (rc) cERROR(1, "could not encrypt crypt key rc: %d\n", rc); - crypto_free_blkcipher(tfm_des); - goto smbhash_err; - } + crypto_free_blkcipher(tfm_des); smbhash_err: return rc; } diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 6cbb3afb36d..cb140ef293e 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -43,8 +43,6 @@ const struct file_operations coda_ioctl_operations = { /* the coda pioctl inode ops */ static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; return (mask & MAY_EXEC) ? 
-EACCES : 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e141939080f..739fb59bcdc 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -37,7 +37,7 @@ static DEFINE_MUTEX(read_mutex); /* These macros may change in future, to provide better st_ino semantics. */ #define OFFSET(x) ((x)->i_ino) -static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset) +static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) { if (!cino->offset) return offset + 1; @@ -61,7 +61,7 @@ static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset) } static struct inode *get_cramfs_inode(struct super_block *sb, - struct cramfs_inode *cramfs_inode, unsigned int offset) + const struct cramfs_inode *cramfs_inode, unsigned int offset) { struct inode *inode; static struct timespec zerotime; @@ -317,7 +317,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* Set it all up.. */ sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, &super.root, 0); - if (!root) + if (IS_ERR(root)) goto out; sb->s_root = d_alloc_root(root); if (!sb->s_root) { @@ -423,6 +423,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { unsigned int offset = 0; + struct inode *inode = NULL; int sorted; mutex_lock(&read_mutex); @@ -449,8 +450,8 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s for (;;) { if (!namelen) { - mutex_unlock(&read_mutex); - return ERR_PTR(-EIO); + inode = ERR_PTR(-EIO); + goto out; } if (name[namelen-1]) break; @@ -462,17 +463,18 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s if (retval > 0) continue; if (!retval) { - struct cramfs_inode entry = *de; - mutex_unlock(&read_mutex); - d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off)); - return NULL; + inode = get_cramfs_inode(dir->i_sb, 
de, dir_off); + break; } /* else (retval < 0) */ if (sorted) break; } +out: mutex_unlock(&read_mutex); - d_add(dentry, NULL); + if (IS_ERR(inode)) + return ERR_CAST(inode); + d_add(dentry, inode); return NULL; } diff --git a/fs/dcache.c b/fs/dcache.c index 37f72ee5bf7..fbdcbca4072 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1813,8 +1813,6 @@ seqretry: tname = dentry->d_name.name; i = dentry->d_inode; prefetch(tname); - if (i) - prefetch(i); /* * This seqcount check is required to ensure name and * len are loaded atomically, so as not to walk off the @@ -2213,14 +2211,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * The hash value has to match the hash queue that the dentry is on.. */ /* - * d_move - move a dentry + * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. + * dcache entries should not be moved in this way. Caller hold + * rename_lock. */ -void d_move(struct dentry * dentry, struct dentry * target) +static void __d_move(struct dentry * dentry, struct dentry * target) { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2228,8 +2227,6 @@ void d_move(struct dentry * dentry, struct dentry * target) BUG_ON(d_ancestor(dentry, target)); BUG_ON(d_ancestor(target, dentry)); - write_seqlock(&rename_lock); - dentry_lock_for_move(dentry, target); write_seqcount_begin(&dentry->d_seq); @@ -2275,6 +2272,20 @@ void d_move(struct dentry * dentry, struct dentry * target) spin_unlock(&target->d_lock); fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); +} + +/* + * d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. 
+ */ +void d_move(struct dentry *dentry, struct dentry *target) +{ + write_seqlock(&rename_lock); + __d_move(dentry, target); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); @@ -2302,7 +2313,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex and the inode->i_lock + * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... @@ -2317,11 +2328,6 @@ static struct dentry *__d_unalias(struct inode *inode, if (alias->d_parent == dentry->d_parent) goto out_unalias; - /* Check for loops */ - ret = ERR_PTR(-ELOOP); - if (d_ancestor(alias, dentry)) - goto out_err; - /* See lock_rename() */ ret = ERR_PTR(-EBUSY); if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) @@ -2331,7 +2337,7 @@ static struct dentry *__d_unalias(struct inode *inode, goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: - d_move(alias, dentry); + __d_move(alias, dentry); ret = alias; out_err: spin_unlock(&inode->i_lock); @@ -2416,15 +2422,24 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) alias = __d_find_alias(inode, 0); if (alias) { actual = alias; - /* Is this an anonymous mountpoint that we could splice - * into our tree? */ - if (IS_ROOT(alias)) { + write_seqlock(&rename_lock); + + if (d_ancestor(alias, dentry)) { + /* Check for loops */ + actual = ERR_PTR(-ELOOP); + } else if (IS_ROOT(alias)) { + /* Is this an anonymous mountpoint that we + * could splice into our tree? */ __d_materialise_dentry(dentry, alias); + write_sequnlock(&rename_lock); __d_drop(alias); goto found; + } else { + /* Nope, but we must(!) avoid directory + * aliasing */ + actual = __d_unalias(inode, dentry, alias); } - /* Nope, but we must(!) 
avoid directory aliasing */ - actual = __d_unalias(inode, dentry, alias); + write_sequnlock(&rename_lock); if (IS_ERR(actual)) dput(alias); goto out_nolock; diff --git a/fs/dcookies.c b/fs/dcookies.c index a21cabdbd87..dda0dc702d1 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -178,6 +178,8 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) /* FIXME: (deleted) ? */ path = d_path(&dcs->path, kbuf, PAGE_SIZE); + mutex_unlock(&dcookie_mutex); + if (IS_ERR(path)) { err = PTR_ERR(path); goto out_free; @@ -194,6 +196,7 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) out_free: kfree(kbuf); + return err; out: mutex_unlock(&dcookie_mutex); return err; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index abc49f29245..90e5997262e 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -14,17 +14,9 @@ #include "dlm_internal.h" #include "lock.h" #include "user.h" -#include "ast.h" - -#define WAKE_ASTS 0 - -static uint64_t ast_seq_count; -static struct list_head ast_queue; -static spinlock_t ast_queue_lock; -static struct task_struct * astd_task; -static unsigned long astd_wakeflags; -static struct mutex astd_running; +static uint64_t dlm_cb_seq; +static spinlock_t dlm_cb_seq_spin; static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) { @@ -57,21 +49,13 @@ static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) } } -void dlm_del_ast(struct dlm_lkb *lkb) -{ - spin_lock(&ast_queue_lock); - if (!list_empty(&lkb->lkb_astqueue)) - list_del_init(&lkb->lkb_astqueue); - spin_unlock(&ast_queue_lock); -} - int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags, uint64_t seq) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; uint64_t prev_seq; int prev_mode; - int i; + int i, rv; for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { if (lkb->lkb_callbacks[i].seq) @@ -100,7 +84,8 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, mode, (unsigned long long)prev_seq, prev_mode); - 
return 0; + rv = 0; + goto out; } } @@ -109,6 +94,7 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, lkb->lkb_callbacks[i].mode = mode; lkb->lkb_callbacks[i].sb_status = status; lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); + rv = 0; break; } @@ -117,21 +103,24 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, lkb->lkb_id, (unsigned long long)seq, flags, mode, status, sbflags); dlm_dump_lkb_callbacks(lkb); - return -1; + rv = -1; + goto out; } - - return 0; + out: + return rv; } int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_callback *cb, int *resid) { - int i; + int i, rv; *resid = 0; - if (!lkb->lkb_callbacks[0].seq) - return -ENOENT; + if (!lkb->lkb_callbacks[0].seq) { + rv = -ENOENT; + goto out; + } /* oldest undelivered cb is callbacks[0] */ @@ -163,7 +152,8 @@ int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, cb->mode, (unsigned long long)lkb->lkb_last_cast.seq, lkb->lkb_last_cast.mode); - return 0; + rv = 0; + goto out; } } @@ -176,171 +166,150 @@ int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); lkb->lkb_last_bast_time = ktime_get(); } - - return 0; + rv = 0; + out: + return rv; } -void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, - uint32_t sbflags) +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags) { - uint64_t seq; + struct dlm_ls *ls = lkb->lkb_resource->res_ls; + uint64_t new_seq, prev_seq; int rv; - spin_lock(&ast_queue_lock); - - seq = ++ast_seq_count; + spin_lock(&dlm_cb_seq_spin); + new_seq = ++dlm_cb_seq; + spin_unlock(&dlm_cb_seq_spin); if (lkb->lkb_flags & DLM_IFL_USER) { - spin_unlock(&ast_queue_lock); - dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq); + dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq); return; } - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, 
seq); - if (rv < 0) { - spin_unlock(&ast_queue_lock); - return; - } + mutex_lock(&lkb->lkb_cb_mutex); + prev_seq = lkb->lkb_callbacks[0].seq; - if (list_empty(&lkb->lkb_astqueue)) { + rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); + if (rv < 0) + goto out; + + if (!prev_seq) { kref_get(&lkb->lkb_ref); - list_add_tail(&lkb->lkb_astqueue, &ast_queue); - } - spin_unlock(&ast_queue_lock); - set_bit(WAKE_ASTS, &astd_wakeflags); - wake_up_process(astd_task); + if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { + mutex_lock(&ls->ls_cb_mutex); + list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); + mutex_unlock(&ls->ls_cb_mutex); + } else { + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + } + } + out: + mutex_unlock(&lkb->lkb_cb_mutex); } -static void process_asts(void) +void dlm_callback_work(struct work_struct *work) { - struct dlm_ls *ls = NULL; - struct dlm_rsb *r = NULL; - struct dlm_lkb *lkb; + struct dlm_lkb *lkb = container_of(work, struct dlm_lkb, lkb_cb_work); + struct dlm_ls *ls = lkb->lkb_resource->res_ls; void (*castfn) (void *astparam); void (*bastfn) (void *astparam, int mode); struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; int i, rv, resid; -repeat: - spin_lock(&ast_queue_lock); - list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { - r = lkb->lkb_resource; - ls = r->res_ls; + memset(&callbacks, 0, sizeof(callbacks)); - if (dlm_locking_stopped(ls)) - continue; - - /* we remove from astqueue list and remove everything in - lkb_callbacks before releasing the spinlock so empty - lkb_astqueue is always consistent with empty lkb_callbacks */ - - list_del_init(&lkb->lkb_astqueue); - - castfn = lkb->lkb_astfn; - bastfn = lkb->lkb_bastfn; + mutex_lock(&lkb->lkb_cb_mutex); + if (!lkb->lkb_callbacks[0].seq) { + /* no callback work exists, shouldn't happen */ + log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); + dlm_print_lkb(lkb); + dlm_dump_lkb_callbacks(lkb); + } - memset(&callbacks, 0, sizeof(callbacks)); + for (i = 0; i < 
DLM_CALLBACKS_SIZE; i++) { + rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); + if (rv < 0) + break; + } - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); - if (rv < 0) - break; - } - spin_unlock(&ast_queue_lock); + if (resid) { + /* cbs remain, loop should have removed all, shouldn't happen */ + log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id, + resid); + dlm_print_lkb(lkb); + dlm_dump_lkb_callbacks(lkb); + } + mutex_unlock(&lkb->lkb_cb_mutex); - if (resid) { - /* shouldn't happen, for loop should have removed all */ - log_error(ls, "callback resid %d lkb %x", - resid, lkb->lkb_id); - } + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (!callbacks[i].seq) - break; - if (callbacks[i].flags & DLM_CB_SKIP) { - continue; - } else if (callbacks[i].flags & DLM_CB_BAST) { - bastfn(lkb->lkb_astparam, callbacks[i].mode); - } else if (callbacks[i].flags & DLM_CB_CAST) { - lkb->lkb_lksb->sb_status = callbacks[i].sb_status; - lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; - castfn(lkb->lkb_astparam); - } + for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { + if (!callbacks[i].seq) + break; + if (callbacks[i].flags & DLM_CB_SKIP) { + continue; + } else if (callbacks[i].flags & DLM_CB_BAST) { + bastfn(lkb->lkb_astparam, callbacks[i].mode); + } else if (callbacks[i].flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = callbacks[i].sb_status; + lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + castfn(lkb->lkb_astparam); } - - /* removes ref for ast_queue, may cause lkb to be freed */ - dlm_put_lkb(lkb); - - cond_resched(); - goto repeat; } - spin_unlock(&ast_queue_lock); -} - -static inline int no_asts(void) -{ - int ret; - spin_lock(&ast_queue_lock); - ret = list_empty(&ast_queue); - spin_unlock(&ast_queue_lock); - return ret; + /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ + dlm_put_lkb(lkb); } -static int dlm_astd(void *data) +int 
dlm_callback_start(struct dlm_ls *ls) { - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - if (!test_bit(WAKE_ASTS, &astd_wakeflags)) - schedule(); - set_current_state(TASK_RUNNING); - - mutex_lock(&astd_running); - if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) - process_asts(); - mutex_unlock(&astd_running); + ls->ls_callback_wq = alloc_workqueue("dlm_callback", + WQ_UNBOUND | + WQ_MEM_RECLAIM | + WQ_NON_REENTRANT, + 0); + if (!ls->ls_callback_wq) { + log_print("can't start dlm_callback workqueue"); + return -ENOMEM; } return 0; } -void dlm_astd_wake(void) +void dlm_callback_stop(struct dlm_ls *ls) { - if (!no_asts()) { - set_bit(WAKE_ASTS, &astd_wakeflags); - wake_up_process(astd_task); - } + if (ls->ls_callback_wq) + destroy_workqueue(ls->ls_callback_wq); } -int dlm_astd_start(void) +void dlm_callback_suspend(struct dlm_ls *ls) { - struct task_struct *p; - int error = 0; - - INIT_LIST_HEAD(&ast_queue); - spin_lock_init(&ast_queue_lock); - mutex_init(&astd_running); - - p = kthread_run(dlm_astd, NULL, "dlm_astd"); - if (IS_ERR(p)) - error = PTR_ERR(p); - else - astd_task = p; - return error; -} + set_bit(LSFL_CB_DELAY, &ls->ls_flags); -void dlm_astd_stop(void) -{ - kthread_stop(astd_task); + if (ls->ls_callback_wq) + flush_workqueue(ls->ls_callback_wq); } -void dlm_astd_suspend(void) +void dlm_callback_resume(struct dlm_ls *ls) { - mutex_lock(&astd_running); -} + struct dlm_lkb *lkb, *safe; + int count = 0; -void dlm_astd_resume(void) -{ - mutex_unlock(&astd_running); + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + + if (!ls->ls_callback_wq) + return; + + mutex_lock(&ls->ls_cb_mutex); + list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { + list_del_init(&lkb->lkb_cb_list); + queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); + count++; + } + mutex_unlock(&ls->ls_cb_mutex); + + log_debug(ls, "dlm_callback_resume %d", count); } diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 8aa89c9b561..757b551c682 100644 --- 
a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -18,14 +18,15 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags, uint64_t seq); int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_callback *cb, int *resid); -void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, - uint32_t sbflags); +void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, + uint32_t sbflags); -void dlm_astd_wake(void); -int dlm_astd_start(void); -void dlm_astd_stop(void); -void dlm_astd_suspend(void); -void dlm_astd_resume(void); +void dlm_callback_work(struct work_struct *work); +int dlm_callback_start(struct dlm_ls *ls); +void dlm_callback_stop(struct dlm_ls *ls); +void dlm_callback_suspend(struct dlm_ls *ls); +void dlm_callback_resume(struct dlm_ls *ls); #endif + diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 9b026ea8baa..6cf72fcc0d0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -28,7 +28,8 @@ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight * /config/dlm/<cluster>/comms/<comm>/nodeid * /config/dlm/<cluster>/comms/<comm>/local - * /config/dlm/<cluster>/comms/<comm>/addr + * /config/dlm/<cluster>/comms/<comm>/addr (write only) + * /config/dlm/<cluster>/comms/<comm>/addr_list (read only) * The <cluster> level is useless, but I haven't figured out how to avoid it. 
*/ @@ -80,6 +81,7 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, size_t len); static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len); +static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf); static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len); @@ -92,7 +94,6 @@ struct dlm_cluster { unsigned int cl_tcp_port; unsigned int cl_buffer_size; unsigned int cl_rsbtbl_size; - unsigned int cl_lkbtbl_size; unsigned int cl_dirtbl_size; unsigned int cl_recover_timer; unsigned int cl_toss_secs; @@ -101,13 +102,13 @@ struct dlm_cluster { unsigned int cl_protocol; unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; + unsigned int cl_new_rsb_count; }; enum { CLUSTER_ATTR_TCP_PORT = 0, CLUSTER_ATTR_BUFFER_SIZE, CLUSTER_ATTR_RSBTBL_SIZE, - CLUSTER_ATTR_LKBTBL_SIZE, CLUSTER_ATTR_DIRTBL_SIZE, CLUSTER_ATTR_RECOVER_TIMER, CLUSTER_ATTR_TOSS_SECS, @@ -116,6 +117,7 @@ enum { CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, + CLUSTER_ATTR_NEW_RSB_COUNT, }; struct cluster_attribute { @@ -160,7 +162,6 @@ __CONFIGFS_ATTR(name, 0644, name##_read, name##_write) CLUSTER_ATTR(tcp_port, 1); CLUSTER_ATTR(buffer_size, 1); CLUSTER_ATTR(rsbtbl_size, 1); -CLUSTER_ATTR(lkbtbl_size, 1); CLUSTER_ATTR(dirtbl_size, 1); CLUSTER_ATTR(recover_timer, 1); CLUSTER_ATTR(toss_secs, 1); @@ -169,12 +170,12 @@ CLUSTER_ATTR(log_debug, 0); CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); +CLUSTER_ATTR(new_rsb_count, 0); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, - [CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr, [CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr, [CLUSTER_ATTR_RECOVER_TIMER] 
= &cluster_attr_recover_timer.attr, [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, @@ -183,6 +184,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, + [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, NULL, }; @@ -190,6 +192,7 @@ enum { COMM_ATTR_NODEID = 0, COMM_ATTR_LOCAL, COMM_ATTR_ADDR, + COMM_ATTR_ADDR_LIST, }; struct comm_attribute { @@ -217,14 +220,22 @@ static struct comm_attribute comm_attr_local = { static struct comm_attribute comm_attr_addr = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "addr", - .ca_mode = S_IRUGO | S_IWUSR }, + .ca_mode = S_IWUSR }, .store = comm_addr_write, }; +static struct comm_attribute comm_attr_addr_list = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "addr_list", + .ca_mode = S_IRUGO }, + .show = comm_addr_list_read, +}; + static struct configfs_attribute *comm_attrs[] = { [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, [COMM_ATTR_LOCAL] = &comm_attr_local.attr, [COMM_ATTR_ADDR] = &comm_attr_addr.attr, + [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr, NULL, }; @@ -435,7 +446,6 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_tcp_port = dlm_config.ci_tcp_port; cl->cl_buffer_size = dlm_config.ci_buffer_size; cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; - cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size; cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size; cl->cl_recover_timer = dlm_config.ci_recover_timer; cl->cl_toss_secs = dlm_config.ci_toss_secs; @@ -444,6 +454,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_protocol = dlm_config.ci_protocol; cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; + cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; space_list = &sps->ss_group; comm_list = &cms->cs_group; @@ -720,6 
+731,50 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) return len; } +static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf) +{ + ssize_t s; + ssize_t allowance; + int i; + struct sockaddr_storage *addr; + struct sockaddr_in *addr_in; + struct sockaddr_in6 *addr_in6; + + /* Taken from ip6_addr_string() defined in lib/vsprintf.c */ + char buf0[sizeof("AF_INET6 xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255\n")]; + + + /* Derived from SIMPLE_ATTR_SIZE of fs/configfs/file.c */ + allowance = 4096; + buf[0] = '\0'; + + for (i = 0; i < cm->addr_count; i++) { + addr = cm->addr[i]; + + switch(addr->ss_family) { + case AF_INET: + addr_in = (struct sockaddr_in *)addr; + s = sprintf(buf0, "AF_INET %pI4\n", &addr_in->sin_addr.s_addr); + break; + case AF_INET6: + addr_in6 = (struct sockaddr_in6 *)addr; + s = sprintf(buf0, "AF_INET6 %pI6\n", &addr_in6->sin6_addr); + break; + default: + s = sprintf(buf0, "%s\n", "<UNKNOWN>"); + break; + } + allowance -= s; + if (allowance >= 0) + strcat(buf, buf0); + else { + allowance += s; + break; + } + } + return 4096 - allowance; +} + static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, char *buf) { @@ -983,7 +1038,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_TCP_PORT 21064 #define DEFAULT_BUFFER_SIZE 4096 #define DEFAULT_RSBTBL_SIZE 1024 -#define DEFAULT_LKBTBL_SIZE 1024 #define DEFAULT_DIRTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 @@ -992,12 +1046,12 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_PROTOCOL 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 +#define DEFAULT_NEW_RSB_COUNT 128 struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, .ci_buffer_size = DEFAULT_BUFFER_SIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, - .ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE, .ci_dirtbl_size = DEFAULT_DIRTBL_SIZE, .ci_recover_timer 
= DEFAULT_RECOVER_TIMER, .ci_toss_secs = DEFAULT_TOSS_SECS, @@ -1005,6 +1059,7 @@ struct dlm_config_info dlm_config = { .ci_log_debug = DEFAULT_LOG_DEBUG, .ci_protocol = DEFAULT_PROTOCOL, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, - .ci_waitwarn_us = DEFAULT_WAITWARN_US + .ci_waitwarn_us = DEFAULT_WAITWARN_US, + .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index dd0ce24d5a8..3099d0dd26c 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -20,7 +20,6 @@ struct dlm_config_info { int ci_tcp_port; int ci_buffer_size; int ci_rsbtbl_size; - int ci_lkbtbl_size; int ci_dirtbl_size; int ci_recover_timer; int ci_toss_secs; @@ -29,6 +28,7 @@ struct dlm_config_info { int ci_protocol; int ci_timewarn_cs; int ci_waitwarn_us; + int ci_new_rsb_count; }; extern struct dlm_config_info dlm_config; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 0262451eb9c..fe2860c0244 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -37,6 +37,7 @@ #include <linux/jhash.h> #include <linux/miscdevice.h> #include <linux/mutex.h> +#include <linux/idr.h> #include <asm/uaccess.h> #include <linux/dlm.h> @@ -52,7 +53,6 @@ struct dlm_ls; struct dlm_lkb; struct dlm_rsb; struct dlm_member; -struct dlm_lkbtable; struct dlm_rsbtable; struct dlm_dirtable; struct dlm_direntry; @@ -108,11 +108,6 @@ struct dlm_rsbtable { spinlock_t lock; }; -struct dlm_lkbtable { - struct list_head list; - rwlock_t lock; - uint16_t counter; -}; /* * Lockspace member (per node in a ls) @@ -248,17 +243,18 @@ struct dlm_lkb { int8_t lkb_wait_count; int lkb_wait_nodeid; /* for debugging */ - struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ struct list_head lkb_statequeue; /* rsb g/c/w list */ struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ - struct list_head lkb_astqueue; /* need ast to be sent */ struct list_head lkb_ownqueue; /* list of locks for a process */ struct 
list_head lkb_time_list; ktime_t lkb_timestamp; ktime_t lkb_wait_time; unsigned long lkb_timeout_cs; + struct mutex lkb_cb_mutex; + struct work_struct lkb_cb_work; + struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; struct dlm_callback lkb_last_cast; struct dlm_callback lkb_last_bast; @@ -299,7 +295,7 @@ struct dlm_rsb { int res_recover_locks_count; char *res_lvbptr; - char res_name[1]; + char res_name[DLM_RESNAME_MAXLEN+1]; }; /* find_rsb() flags */ @@ -465,12 +461,12 @@ struct dlm_ls { unsigned long ls_scan_time; struct kobject ls_kobj; + struct idr ls_lkbidr; + spinlock_t ls_lkbidr_spin; + struct dlm_rsbtable *ls_rsbtbl; uint32_t ls_rsbtbl_size; - struct dlm_lkbtable *ls_lkbtbl; - uint32_t ls_lkbtbl_size; - struct dlm_dirtable *ls_dirtbl; uint32_t ls_dirtbl_size; @@ -483,6 +479,10 @@ struct dlm_ls { struct mutex ls_timeout_mutex; struct list_head ls_timeout; + spinlock_t ls_new_rsb_spin; + int ls_new_rsb_count; + struct list_head ls_new_rsb; /* new rsb structs */ + struct list_head ls_nodes; /* current nodes in ls */ struct list_head ls_nodes_gone; /* dead node list, recovery */ int ls_num_nodes; /* number of nodes in ls */ @@ -506,8 +506,12 @@ struct dlm_ls { struct miscdevice ls_device; + struct workqueue_struct *ls_callback_wq; + /* recovery related */ + struct mutex ls_cb_mutex; + struct list_head ls_cb_delay; /* save for queue_work later */ struct timer_list ls_timer; struct task_struct *ls_recoverd_task; struct mutex ls_recoverd_active; @@ -544,6 +548,7 @@ struct dlm_ls { #define LSFL_RCOM_WAIT 4 #define LSFL_UEVENT_WAIT 5 #define LSFL_TIMEWARN 6 +#define LSFL_CB_DELAY 7 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f71d0b5abd9..83b5e32514e 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -305,7 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct 
dlm_lkb *lkb, int rv) rv = -EDEADLK; } - dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); + dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); } static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -319,7 +319,7 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) if (is_master_copy(lkb)) { send_bast(r, lkb, rqmode); } else { - dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0); + dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0); } } @@ -327,19 +327,68 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) * Basic operations on rsb's and lkb's */ -static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) +static int pre_rsb_struct(struct dlm_ls *ls) +{ + struct dlm_rsb *r1, *r2; + int count = 0; + + spin_lock(&ls->ls_new_rsb_spin); + if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) { + spin_unlock(&ls->ls_new_rsb_spin); + return 0; + } + spin_unlock(&ls->ls_new_rsb_spin); + + r1 = dlm_allocate_rsb(ls); + r2 = dlm_allocate_rsb(ls); + + spin_lock(&ls->ls_new_rsb_spin); + if (r1) { + list_add(&r1->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + if (r2) { + list_add(&r2->res_hashchain, &ls->ls_new_rsb); + ls->ls_new_rsb_count++; + } + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + + if (!count) + return -ENOMEM; + return 0; +} + +/* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can + unlock any spinlocks, go back and call pre_rsb_struct again. + Otherwise, take an rsb off the list and return it. 
*/ + +static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, + struct dlm_rsb **r_ret) { struct dlm_rsb *r; + int count; - r = dlm_allocate_rsb(ls, len); - if (!r) - return NULL; + spin_lock(&ls->ls_new_rsb_spin); + if (list_empty(&ls->ls_new_rsb)) { + count = ls->ls_new_rsb_count; + spin_unlock(&ls->ls_new_rsb_spin); + log_debug(ls, "find_rsb retry %d %d %s", + count, dlm_config.ci_new_rsb_count, name); + return -EAGAIN; + } + + r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); + list_del(&r->res_hashchain); + ls->ls_new_rsb_count--; + spin_unlock(&ls->ls_new_rsb_spin); r->res_ls = ls; r->res_length = len; memcpy(r->res_name, name, len); mutex_init(&r->res_mutex); + INIT_LIST_HEAD(&r->res_hashchain); INIT_LIST_HEAD(&r->res_lookup); INIT_LIST_HEAD(&r->res_grantqueue); INIT_LIST_HEAD(&r->res_convertqueue); @@ -347,7 +396,8 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) INIT_LIST_HEAD(&r->res_root_list); INIT_LIST_HEAD(&r->res_recover_list); - return r; + *r_ret = r; + return 0; } static int search_rsb_list(struct list_head *head, char *name, int len, @@ -405,16 +455,6 @@ static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, return error; } -static int search_rsb(struct dlm_ls *ls, char *name, int len, int b, - unsigned int flags, struct dlm_rsb **r_ret) -{ - int error; - spin_lock(&ls->ls_rsbtbl[b].lock); - error = _search_rsb(ls, name, len, b, flags, r_ret); - spin_unlock(&ls->ls_rsbtbl[b].lock); - return error; -} - /* * Find rsb in rsbtbl and potentially create/add one * @@ -432,35 +472,48 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b, static int find_rsb(struct dlm_ls *ls, char *name, int namelen, unsigned int flags, struct dlm_rsb **r_ret) { - struct dlm_rsb *r = NULL, *tmp; + struct dlm_rsb *r = NULL; uint32_t hash, bucket; - int error = -EINVAL; + int error; - if (namelen > DLM_RESNAME_MAXLEN) + if (namelen > DLM_RESNAME_MAXLEN) { + error = -EINVAL; goto out; 
+ } if (dlm_no_directory(ls)) flags |= R_CREATE; - error = 0; hash = jhash(name, namelen, 0); bucket = hash & (ls->ls_rsbtbl_size - 1); - error = search_rsb(ls, name, namelen, bucket, flags, &r); + retry: + if (flags & R_CREATE) { + error = pre_rsb_struct(ls); + if (error < 0) + goto out; + } + + spin_lock(&ls->ls_rsbtbl[bucket].lock); + + error = _search_rsb(ls, name, namelen, bucket, flags, &r); if (!error) - goto out; + goto out_unlock; if (error == -EBADR && !(flags & R_CREATE)) - goto out; + goto out_unlock; /* the rsb was found but wasn't a master copy */ if (error == -ENOTBLK) - goto out; + goto out_unlock; - error = -ENOMEM; - r = create_rsb(ls, name, namelen); - if (!r) - goto out; + error = get_rsb_struct(ls, name, namelen, &r); + if (error == -EAGAIN) { + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + goto retry; + } + if (error) + goto out_unlock; r->res_hash = hash; r->res_bucket = bucket; @@ -474,18 +527,10 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, nodeid = 0; r->res_nodeid = nodeid; } - - spin_lock(&ls->ls_rsbtbl[bucket].lock); - error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); - if (!error) { - spin_unlock(&ls->ls_rsbtbl[bucket].lock); - dlm_free_rsb(r); - r = tmp; - goto out; - } list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); - spin_unlock(&ls->ls_rsbtbl[bucket].lock); error = 0; + out_unlock: + spin_unlock(&ls->ls_rsbtbl[bucket].lock); out: *r_ret = r; return error; @@ -580,9 +625,8 @@ static void detach_lkb(struct dlm_lkb *lkb) static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) { - struct dlm_lkb *lkb, *tmp; - uint32_t lkid = 0; - uint16_t bucket; + struct dlm_lkb *lkb; + int rv, id; lkb = dlm_allocate_lkb(ls); if (!lkb) @@ -594,60 +638,42 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); INIT_LIST_HEAD(&lkb->lkb_time_list); - INIT_LIST_HEAD(&lkb->lkb_astqueue); + INIT_LIST_HEAD(&lkb->lkb_cb_list); 
+ mutex_init(&lkb->lkb_cb_mutex); + INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); - get_random_bytes(&bucket, sizeof(bucket)); - bucket &= (ls->ls_lkbtbl_size - 1); - - write_lock(&ls->ls_lkbtbl[bucket].lock); + retry: + rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS); + if (!rv) + return -ENOMEM; - /* counter can roll over so we must verify lkid is not in use */ + spin_lock(&ls->ls_lkbidr_spin); + rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id); + if (!rv) + lkb->lkb_id = id; + spin_unlock(&ls->ls_lkbidr_spin); - while (lkid == 0) { - lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++; + if (rv == -EAGAIN) + goto retry; - list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list, - lkb_idtbl_list) { - if (tmp->lkb_id != lkid) - continue; - lkid = 0; - break; - } + if (rv < 0) { + log_error(ls, "create_lkb idr error %d", rv); + return rv; } - lkb->lkb_id = lkid; - list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list); - write_unlock(&ls->ls_lkbtbl[bucket].lock); - *lkb_ret = lkb; return 0; } -static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid) -{ - struct dlm_lkb *lkb; - uint16_t bucket = (lkid >> 16); - - list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) { - if (lkb->lkb_id == lkid) - return lkb; - } - return NULL; -} - static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; - uint16_t bucket = (lkid >> 16); - - if (bucket >= ls->ls_lkbtbl_size) - return -EBADSLT; - read_lock(&ls->ls_lkbtbl[bucket].lock); - lkb = __find_lkb(ls, lkid); + spin_lock(&ls->ls_lkbidr_spin); + lkb = idr_find(&ls->ls_lkbidr, lkid); if (lkb) kref_get(&lkb->lkb_ref); - read_unlock(&ls->ls_lkbtbl[bucket].lock); + spin_unlock(&ls->ls_lkbidr_spin); *lkb_ret = lkb; return lkb ? 
0 : -ENOENT; @@ -668,12 +694,12 @@ static void kill_lkb(struct kref *kref) static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) { - uint16_t bucket = (lkb->lkb_id >> 16); + uint32_t lkid = lkb->lkb_id; - write_lock(&ls->ls_lkbtbl[bucket].lock); + spin_lock(&ls->ls_lkbidr_spin); if (kref_put(&lkb->lkb_ref, kill_lkb)) { - list_del(&lkb->lkb_idtbl_list); - write_unlock(&ls->ls_lkbtbl[bucket].lock); + idr_remove(&ls->ls_lkbidr, lkid); + spin_unlock(&ls->ls_lkbidr_spin); detach_lkb(lkb); @@ -683,7 +709,7 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) dlm_free_lkb(lkb); return 1; } else { - write_unlock(&ls->ls_lkbtbl[bucket].lock); + spin_unlock(&ls->ls_lkbidr_spin); return 0; } } @@ -849,9 +875,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) if (!num_nodes) { num_nodes = ls->ls_num_nodes; - warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int)); - if (warned) - memset(warned, 0, num_nodes * sizeof(int)); + warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL); } if (!warned) continue; @@ -863,9 +887,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); } mutex_unlock(&ls->ls_waiters_mutex); - - if (warned) - kfree(warned); + kfree(warned); if (debug_expired) log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", @@ -2401,9 +2423,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) if (deadlk) { /* it's left on the granted queue */ - log_debug(r->res_ls, "deadlock %x node %d sts%d g%d r%d %s", - lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_status, - lkb->lkb_grmode, lkb->lkb_rqmode, r->res_name); revert_lock(r, lkb); queue_cast(r, lkb, -EDEADLK); error = -EDEADLK; @@ -3993,8 +4012,6 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) default: log_error(ls, "unknown message type %d", ms->m_type); } - - dlm_astd_wake(); } /* If the lockspace is in recovery mode (locking stopped), then normal @@ -4133,7 +4150,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) struct 
dlm_message *ms_stub; int wait_type, stub_unlock_result, stub_cancel_result; - ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message)); + ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); if (!ms_stub) { log_error(ls, "dlm_recover_waiters_pre no mem"); return; @@ -4809,7 +4826,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, goto out_put; spin_lock(&ua->proc->locks_spin); - /* dlm_user_add_ast() may have already taken lkb off the proc list */ + /* dlm_user_add_cb() may have already taken lkb off the proc list */ if (!list_empty(&lkb->lkb_ownqueue)) list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); @@ -4946,7 +4963,7 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) /* We have to release clear_proc_locks mutex before calling unlock_proc_lock() (which does lock_rsb) due to deadlock with receiving a message that does - lock_rsb followed by dlm_user_add_ast() */ + lock_rsb followed by dlm_user_add_cb() */ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, struct dlm_user_proc *proc) @@ -4969,7 +4986,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, return lkb; } -/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which +/* The ls_clear_proc_locks mutex protects against dlm_user_add_cb() which 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts, which we clear here. 
*/ @@ -5011,10 +5028,10 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { memset(&lkb->lkb_callbacks, 0, sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); - list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } @@ -5053,10 +5070,10 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_unlock(&proc->locks_spin); spin_lock(&proc->asts_spin); - list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { + list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { memset(&lkb->lkb_callbacks, 0, sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); - list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } spin_unlock(&proc->asts_spin); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 14cbf409975..a1d8f1af144 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -15,7 +15,6 @@ #include "lockspace.h" #include "member.h" #include "recoverd.h" -#include "ast.h" #include "dir.h" #include "lowcomms.h" #include "config.h" @@ -24,6 +23,7 @@ #include "recover.h" #include "requestqueue.h" #include "user.h" +#include "ast.h" static int ls_count; static struct mutex ls_lock; @@ -359,17 +359,10 @@ static int threads_start(void) { int error; - /* Thread which process lock requests for all lockspace's */ - error = dlm_astd_start(); - if (error) { - log_print("cannot start dlm_astd thread %d", error); - goto fail; - } - error = dlm_scand_start(); if (error) { log_print("cannot start dlm_scand thread %d", error); - goto astd_fail; + goto fail; } /* Thread for sending/receiving messages for all lockspace's */ @@ -383,8 +376,6 @@ static int threads_start(void) scand_fail: dlm_scand_stop(); - astd_fail: - dlm_astd_stop(); fail: return error; } @@ -393,7 +384,6 @@ static void 
threads_stop(void) { dlm_scand_stop(); dlm_lowcomms_stop(); - dlm_astd_stop(); } static int new_lockspace(const char *name, int namelen, void **lockspace, @@ -463,7 +453,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; - ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS); + ls->ls_rsbtbl = vmalloc(sizeof(struct dlm_rsbtable) * size); if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { @@ -472,22 +462,13 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, spin_lock_init(&ls->ls_rsbtbl[i].lock); } - size = dlm_config.ci_lkbtbl_size; - ls->ls_lkbtbl_size = size; - - ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS); - if (!ls->ls_lkbtbl) - goto out_rsbfree; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list); - rwlock_init(&ls->ls_lkbtbl[i].lock); - ls->ls_lkbtbl[i].counter = 1; - } + idr_init(&ls->ls_lkbidr); + spin_lock_init(&ls->ls_lkbidr_spin); size = dlm_config.ci_dirtbl_size; ls->ls_dirtbl_size = size; - ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS); + ls->ls_dirtbl = vmalloc(sizeof(struct dlm_dirtable) * size); if (!ls->ls_dirtbl) goto out_lkbfree; for (i = 0; i < size; i++) { @@ -502,6 +483,9 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); + INIT_LIST_HEAD(&ls->ls_new_rsb); + spin_lock_init(&ls->ls_new_rsb_spin); + INIT_LIST_HEAD(&ls->ls_nodes); INIT_LIST_HEAD(&ls->ls_nodes_gone); ls->ls_num_nodes = 0; @@ -520,6 +504,9 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, init_completion(&ls->ls_members_done); ls->ls_members_result = -1; + mutex_init(&ls->ls_cb_mutex); + INIT_LIST_HEAD(&ls->ls_cb_delay); + ls->ls_recoverd_task = NULL; mutex_init(&ls->ls_recoverd_active); spin_lock_init(&ls->ls_recover_lock); @@ -553,18 +540,26 @@ 
static int new_lockspace(const char *name, int namelen, void **lockspace, list_add(&ls->ls_list, &lslist); spin_unlock(&lslist_lock); + if (flags & DLM_LSFL_FS) { + error = dlm_callback_start(ls); + if (error) { + log_error(ls, "can't start dlm_callback %d", error); + goto out_delist; + } + } + /* needs to find ls in lslist */ error = dlm_recoverd_start(ls); if (error) { log_error(ls, "can't start dlm_recoverd %d", error); - goto out_delist; + goto out_callback; } ls->ls_kobj.kset = dlm_kset; error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL, "%s", ls->ls_name); if (error) - goto out_stop; + goto out_recoverd; kobject_uevent(&ls->ls_kobj, KOBJ_ADD); /* let kobject handle freeing of ls if there's an error */ @@ -578,7 +573,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, error = do_uevent(ls, 1); if (error) - goto out_stop; + goto out_recoverd; wait_for_completion(&ls->ls_members_done); error = ls->ls_members_result; @@ -595,19 +590,20 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, do_uevent(ls, 0); dlm_clear_members(ls); kfree(ls->ls_node_array); - out_stop: + out_recoverd: dlm_recoverd_stop(ls); + out_callback: + dlm_callback_stop(ls); out_delist: spin_lock(&lslist_lock); list_del(&ls->ls_list); spin_unlock(&lslist_lock); kfree(ls->ls_recover_buf); out_dirfree: - kfree(ls->ls_dirtbl); + vfree(ls->ls_dirtbl); out_lkbfree: - kfree(ls->ls_lkbtbl); - out_rsbfree: - kfree(ls->ls_rsbtbl); + idr_destroy(&ls->ls_lkbidr); + vfree(ls->ls_rsbtbl); out_lsfree: if (do_unreg) kobject_put(&ls->ls_kobj); @@ -641,50 +637,64 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace, return error; } -/* Return 1 if the lockspace still has active remote locks, - * 2 if the lockspace still has active local locks. - */ -static int lockspace_busy(struct dlm_ls *ls) -{ - int i, lkb_found = 0; - struct dlm_lkb *lkb; - - /* NOTE: We check the lockidtbl here rather than the resource table. 
- This is because there may be LKBs queued as ASTs that have been - unlinked from their RSBs and are pending deletion once the AST has - been delivered */ - - for (i = 0; i < ls->ls_lkbtbl_size; i++) { - read_lock(&ls->ls_lkbtbl[i].lock); - if (!list_empty(&ls->ls_lkbtbl[i].list)) { - lkb_found = 1; - list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list, - lkb_idtbl_list) { - if (!lkb->lkb_nodeid) { - read_unlock(&ls->ls_lkbtbl[i].lock); - return 2; - } - } - } - read_unlock(&ls->ls_lkbtbl[i].lock); +static int lkb_idr_is_local(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + if (!lkb->lkb_nodeid) + return 1; + return 0; +} + +static int lkb_idr_is_any(int id, void *p, void *data) +{ + return 1; +} + +static int lkb_idr_free(int id, void *p, void *data) +{ + struct dlm_lkb *lkb = p; + + if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) + dlm_free_lvb(lkb->lkb_lvbptr); + + dlm_free_lkb(lkb); + return 0; +} + +/* NOTE: We check the lkbidr here rather than the resource table. 
+ This is because there may be LKBs queued as ASTs that have been unlinked + from their RSBs and are pending deletion once the AST has been delivered */ + +static int lockspace_busy(struct dlm_ls *ls, int force) +{ + int rv; + + spin_lock(&ls->ls_lkbidr_spin); + if (force == 0) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_any, ls); + } else if (force == 1) { + rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_local, ls); + } else { + rv = 0; } - return lkb_found; + spin_unlock(&ls->ls_lkbidr_spin); + return rv; } static int release_lockspace(struct dlm_ls *ls, int force) { - struct dlm_lkb *lkb; struct dlm_rsb *rsb; struct list_head *head; int i, busy, rv; - busy = lockspace_busy(ls); + busy = lockspace_busy(ls, force); spin_lock(&lslist_lock); if (ls->ls_create_count == 1) { - if (busy > force) + if (busy) { rv = -EBUSY; - else { + } else { /* remove_lockspace takes ls off lslist */ ls->ls_create_count = 0; rv = 0; @@ -708,12 +718,12 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_recoverd_stop(ls); + dlm_callback_stop(ls); + remove_lockspace(ls); dlm_delete_debug_file(ls); - dlm_astd_suspend(); - kfree(ls->ls_recover_buf); /* @@ -721,31 +731,15 @@ static int release_lockspace(struct dlm_ls *ls, int force) */ dlm_dir_clear(ls); - kfree(ls->ls_dirtbl); + vfree(ls->ls_dirtbl); /* - * Free all lkb's on lkbtbl[] lists. 
+ * Free all lkb's in idr */ - for (i = 0; i < ls->ls_lkbtbl_size; i++) { - head = &ls->ls_lkbtbl[i].list; - while (!list_empty(head)) { - lkb = list_entry(head->next, struct dlm_lkb, - lkb_idtbl_list); - - list_del(&lkb->lkb_idtbl_list); - - dlm_del_ast(lkb); - - if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) - dlm_free_lvb(lkb->lkb_lvbptr); - - dlm_free_lkb(lkb); - } - } - dlm_astd_resume(); - - kfree(ls->ls_lkbtbl); + idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls); + idr_remove_all(&ls->ls_lkbidr); + idr_destroy(&ls->ls_lkbidr); /* * Free all rsb's on rsbtbl[] lists @@ -770,7 +764,14 @@ static int release_lockspace(struct dlm_ls *ls, int force) } } - kfree(ls->ls_rsbtbl); + vfree(ls->ls_rsbtbl); + + while (!list_empty(&ls->ls_new_rsb)) { + rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, + res_hashchain); + list_del(&rsb->res_hashchain); + dlm_free_rsb(rsb); + } /* * Free structures on any other lists diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5e2c71f05e4..990626e7da8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -512,12 +512,10 @@ static void process_sctp_notification(struct connection *con, } make_sockaddr(&prim.ssp_addr, 0, &addr_len); if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { - int i; unsigned char *b=(unsigned char *)&prim.ssp_addr; log_print("reject connect from unknown addr"); - for (i=0; i<sizeof(struct sockaddr_storage);i++) - printk("%02x ", b[i]); - printk("\n"); + print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, + b, sizeof(struct sockaddr_storage)); sctp_send_shutdown(prim.ssp_assoc_id); return; } @@ -748,7 +746,10 @@ static int tcp_accept_from_sock(struct connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) { + unsigned char *b=(unsigned char *)&peeraddr; log_print("connect from non cluster node"); + print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, + b, sizeof(struct sockaddr_storage)); sock_release(newsock); 
mutex_unlock(&con->sock_mutex); return -1; diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 8e0d00db004..da64df7576e 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -16,6 +16,7 @@ #include "memory.h" static struct kmem_cache *lkb_cache; +static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) @@ -26,6 +27,14 @@ int __init dlm_memory_init(void) __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) ret = -ENOMEM; + + rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), + __alignof__(struct dlm_rsb), 0, NULL); + if (!rsb_cache) { + kmem_cache_destroy(lkb_cache); + ret = -ENOMEM; + } + return ret; } @@ -33,6 +42,8 @@ void dlm_memory_exit(void) { if (lkb_cache) kmem_cache_destroy(lkb_cache); + if (rsb_cache) + kmem_cache_destroy(rsb_cache); } char *dlm_allocate_lvb(struct dlm_ls *ls) @@ -48,16 +59,11 @@ void dlm_free_lvb(char *p) kfree(p); } -/* FIXME: have some minimal space built-in to rsb for the name and - kmalloc a separate name if needed, like dentries are done */ - -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls) { struct dlm_rsb *r; - DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); - - r = kzalloc(sizeof(*r) + namelen, GFP_NOFS); + r = kmem_cache_zalloc(rsb_cache, GFP_NOFS); return r; } @@ -65,7 +71,7 @@ void dlm_free_rsb(struct dlm_rsb *r) { if (r->res_lvbptr) dlm_free_lvb(r->res_lvbptr); - kfree(r); + kmem_cache_free(rsb_cache, r); } struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h index 485fb29143b..177c11cbb0a 100644 --- a/fs/dlm/memory.h +++ b/fs/dlm/memory.h @@ -16,7 +16,7 @@ int dlm_memory_init(void); void dlm_memory_exit(void); -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); +struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls); void dlm_free_rsb(struct dlm_rsb *r); struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); void dlm_free_lkb(struct dlm_lkb *l); diff --git a/fs/dlm/recoverd.c 
b/fs/dlm/recoverd.c index fd677c8c3d3..774da3cf92c 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -58,13 +58,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) mutex_lock(&ls->ls_recoverd_active); - /* - * Suspending and resuming dlm_astd ensures that no lkb's from this ls - * will be processed by dlm_astd during recovery. - */ - - dlm_astd_suspend(); - dlm_astd_resume(); + dlm_callback_suspend(ls); /* * Free non-master tossed rsb's. Master rsb's are kept on toss @@ -202,6 +196,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_adjust_timeouts(ls); + dlm_callback_resume(ls); + error = enable_locking(ls, rv->seq); if (error) { log_debug(ls, "enable_locking failed %d", error); @@ -222,8 +218,6 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_grant_after_purge(ls); - dlm_astd_wake(); - log_debug(ls, "recover %llx done: %u ms", (unsigned long long)rv->seq, jiffies_to_msecs(jiffies - start)); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index e96bf3e9be8..d8ea6075640 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -213,9 +213,9 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, goto out; } - if (list_empty(&lkb->lkb_astqueue)) { + if (list_empty(&lkb->lkb_cb_list)) { kref_get(&lkb->lkb_ref); - list_add_tail(&lkb->lkb_astqueue, &proc->asts); + list_add_tail(&lkb->lkb_cb_list, &proc->asts); wake_up_interruptible(&proc->wait); } spin_unlock(&proc->asts_spin); @@ -832,24 +832,24 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } /* if we empty lkb_callbacks, we don't want to unlock the spinlock - without removing lkb_astqueue; so empty lkb_astqueue is always + without removing lkb_cb_list; so empty lkb_cb_list is always consistent with empty lkb_callbacks */ - lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); + lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, 
&cb, &resid); if (rv < 0) { /* this shouldn't happen; lkb should have been removed from list when resid was zero */ log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); - list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); goto try_another; } if (!resid) - list_del_init(&lkb->lkb_astqueue); + list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); if (cb.flags & DLM_CB_SKIP) { diff --git a/fs/exec.c b/fs/exec.c index ea5f748906a..6075a1e727a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1093,6 +1093,7 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ + set_fs(USER_DS); current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1357,10 +1358,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) if (retval) return retval; - /* kernel module loader fixup */ - /* so we don't try to load run modprobe in kernel space. 
*/ - set_fs(USER_DS); - retval = audit_bprm(bprm); if (retval) return retval; @@ -1999,7 +1996,7 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info) +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *rp, *wp; struct fdtable *fdt; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 06065bd37fc..c57beddcc21 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -913,7 +913,7 @@ struct dentry *exofs_get_parent(struct dentry *child) unsigned long ino = exofs_parent_ino(child); if (!ino) - return NULL; + return ERR_PTR(-ESTALE); return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); } diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 2e29abb30f7..095c36f3b61 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -125,7 +125,7 @@ struct ext4_ext_path { * positive retcode - signal for ext4_ext_walk_space(), see below * callback must return valid extent (passed or newly created) */ -typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, struct ext4_ext_cache *, struct ext4_extent *, void *); @@ -133,8 +133,11 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, #define EXT_BREAK 1 #define EXT_REPEAT 2 -/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ -#define EXT_MAX_BLOCK 0xffffffff +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
+ */ +#define EXT_MAX_BLOCKS 0xffffffff /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5199bac7fc6..f815cc81e7a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1408,7 +1408,7 @@ got_index: /* * ext4_ext_next_allocated_block: - * returns allocated block in subsequent extent or EXT_MAX_BLOCK. + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. @@ -1422,7 +1422,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; while (depth >= 0) { if (depth == path->p_depth) { @@ -1439,12 +1439,12 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: - * returns first allocated block from next leaf or EXT_MAX_BLOCK + * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, struct ext4_ext_path *path) @@ -1456,7 +1456,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, /* zero-tree has no leaf blocks at all */ if (depth == 0) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; /* go to index block */ depth--; @@ -1469,7 +1469,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* @@ -1677,13 +1677,13 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); - if (b2 == EXT_MAX_BLOCK) + if (b2 == EXT_MAX_BLOCKS) goto out; } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { - len1 = EXT_MAX_BLOCK - b1; + len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = 
cpu_to_le16(len1); ret = 1; } @@ -1767,7 +1767,7 @@ repeat: fex = EXT_LAST_EXTENT(eh); next = ext4_ext_next_leaf_block(inode, path); if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCK) { + && next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); @@ -1887,7 +1887,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, BUG_ON(func == NULL); BUG_ON(inode == NULL); - while (block < last && block != EXT_MAX_BLOCK) { + while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); @@ -1958,7 +1958,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, err = -EIO; break; } - err = func(inode, path, &cbex, ex, cbdata); + err = func(inode, next, &cbex, ex, cbdata); ext4_ext_drop_refs(path); if (err < 0) @@ -2020,7 +2020,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ lblock = 0; - len = EXT_MAX_BLOCK; + len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2350,7 +2350,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * never happen because at least one of the end points * needs to be on the edge of the extent. 
*/ - if (end == EXT_MAX_BLOCK) { + if (end == EXT_MAX_BLOCKS - 1) { ext_debug(" bad truncate %u:%u\n", start, end); block = 0; @@ -2398,7 +2398,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If this is a truncate, this condition * should never happen */ - if (end == EXT_MAX_BLOCK) { + if (end == EXT_MAX_BLOCKS - 1) { ext_debug(" bad truncate %u:%u\n", start, end); err = -EIO; @@ -2478,7 +2478,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * we need to remove it from the leaf */ if (num == 0) { - if (end != EXT_MAX_BLOCK) { + if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that @@ -3699,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3914,14 +3914,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, /* * Callback function called for each extent to gather FIEMAP information. 
*/ -static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { __u64 logical; __u64 physical; __u64 length; - loff_t size; __u32 flags = 0; int ret = 0; struct fiemap_extent_info *fieinfo = data; @@ -4103,8 +4102,7 @@ found_delayed_extent: if (ex && ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; - size = i_size_read(inode); - if (logical + length >= size) + if (next == EXT_MAX_BLOCKS) flags |= FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, physical, @@ -4347,8 +4345,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCK) - last_blk = EXT_MAX_BLOCK-1; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a5763e3505b..e3126c05100 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2634,7 +2634,7 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; - trace_ext4_writepage(inode, page); + trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 859f2ae8864..6ed859d5685 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3578,8 +3578,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa, - grp_blk_start + bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + next - bit); 
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3608,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(sb, pa); + trace_ext4_mb_release_group_pa(pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -4448,7 +4448,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * @inode: inode * @block: start physical block to free * @count: number of blocks to count - * @metadata: Are these metadata blocks + * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2b8304bf3c5..f57455a1b1b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1002,12 +1002,12 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if ((orig_start > EXT_MAX_BLOCK) || - (donor_start > EXT_MAX_BLOCK) || - (*len > EXT_MAX_BLOCK) || - (orig_start + *len > EXT_MAX_BLOCK)) { + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " - "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cc5c157aa11..9ea71aa864b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2243,6 +2243,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * + * However there is other limiting factor. 
We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) @@ -2264,10 +2270,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) upper_limit <<= blkbits; } - /* 32-bit extent-start container, ee_block */ - res = 1LL << 32; + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; res <<= blkbits; - res -= 1; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) diff --git a/fs/fat/file.c b/fs/fat/file.c index 7257752b6d5..7018e1d8902 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -102,7 +102,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) if (attr & ATTR_SYS) inode->i_flags |= S_IMMUTABLE; else - inode->i_flags &= S_IMMUTABLE; + inode->i_flags &= ~S_IMMUTABLE; } fat_save_attrs(inode, attr); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index a2a5d19ece6..3f7a59bfa7a 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -954,3 +954,43 @@ void fscache_mark_pages_cached(struct fscache_retrieval *op, pagevec_reinit(pagevec); } EXPORT_SYMBOL(fscache_mark_pages_cached); + +/* + * Uncache all the pages in an inode that are marked PG_fscache, assuming them + * to be associated with the given cookie. 
+ */ +void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t next; + int i; + + _enter("%p,%p", cookie, inode); + + if (!mapping || mapping->nrpages == 0) { + _leave(" [no pages]"); + return; + } + + pagevec_init(&pvec, 0); + next = 0; + do { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + break; + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + next = page->index; + if (PageFsCache(page)) { + __fscache_wait_on_page_write(cookie, page); + __fscache_uncache_page(cookie, page); + } + } + pagevec_release(&pvec); + cond_resched(); + } while (++next); + + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc6ec4b2f0f..38f84cd48b6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -921,6 +921,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; + sb->s_flags &= ~MS_NOSEC; + if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 802ac5eeba2..f9fbbe96c22 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1069,6 +1069,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return 0; gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { if (atomic_read(&bh->b_count)) @@ -1080,6 +1081,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) goto not_possible; bh = bh->b_this_page; } while(bh != head); + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); head = bh = page_buffers(page); @@ -1112,6 +1114,7 @@ not_possible: /* Should never happen */ WARN_ON(buffer_dirty(bh)); WARN_ON(buffer_pinned(bh)); cannot_release: + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); return 0; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e65493a8ac0..42e477f3122 100644 
--- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -854,11 +854,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) { - if (metadata) - __gfs2_free_meta(ip, bstart, blen); - else - __gfs2_free_data(ip, bstart, blen); - + __gfs2_free_blocks(ip, bstart, blen, metadata); btotal += blen; } @@ -870,11 +866,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) { - if (metadata) - __gfs2_free_meta(ip, bstart, blen); - else - __gfs2_free_data(ip, bstart, blen); - + __gfs2_free_blocks(ip, bstart, blen, metadata); btotal += blen; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 091ee477953..1cc2f8ec52a 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -339,6 +339,67 @@ fail: return (copied) ? copied : error; } +/** + * gfs2_dir_get_hash_table - Get pointer to the dir hash table + * @ip: The inode in question + * + * Returns: The hash table or an error + */ + +static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + int ret; + u32 hsize; + __be64 *hc; + + BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH)); + + hc = ip->i_hash_cache; + if (hc) + return hc; + + hsize = 1 << ip->i_depth; + hsize *= sizeof(__be64); + if (hsize != i_size_read(&ip->i_inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + hc = kmalloc(hsize, GFP_NOFS); + ret = -ENOMEM; + if (hc == NULL) + return ERR_PTR(-ENOMEM); + + ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); + if (ret < 0) { + kfree(hc); + return ERR_PTR(ret); + } + + spin_lock(&inode->i_lock); + if (ip->i_hash_cache) + kfree(hc); + else + ip->i_hash_cache = hc; + spin_unlock(&inode->i_lock); + + return ip->i_hash_cache; +} + +/** + * gfs2_dir_hash_inval - Invalidate dir hash + * @ip: The directory inode + * + * Must be called with an exclusive glock, or during glock invalidation. 
+ */ +void gfs2_dir_hash_inval(struct gfs2_inode *ip) +{ + __be64 *hc = ip->i_hash_cache; + ip->i_hash_cache = NULL; + kfree(hc); +} + static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) { return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0; @@ -686,17 +747,12 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { - __be64 leaf_no; - int error; - - error = gfs2_dir_read_data(dip, (char *)&leaf_no, - index * sizeof(__be64), - sizeof(__be64), 0); - if (error != sizeof(u64)) - return (error < 0) ? error : -EIO; - - *leaf_out = be64_to_cpu(leaf_no); + __be64 *hash; + hash = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hash)) + return PTR_ERR(hash); + *leaf_out = be64_to_cpu(*(hash + index)); return 0; } @@ -966,6 +1022,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) for (x = 0; x < half_len; x++) lp[x] = cpu_to_be64(bn); + gfs2_dir_hash_inval(dip); + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), half_len * sizeof(u64)); if (error != half_len * sizeof(u64)) { @@ -1052,70 +1110,54 @@ fail_brelse: static int dir_double_exhash(struct gfs2_inode *dip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct buffer_head *dibh; u32 hsize; - u64 *buf; - u64 *from, *to; - u64 block; - u64 disksize = i_size_read(&dip->i_inode); + u32 hsize_bytes; + __be64 *hc; + __be64 *hc2, *h; int x; int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != disksize) { - gfs2_consist_inode(dip); - return -EIO; - } + hsize_bytes = hsize * sizeof(__be64); - /* Allocate both the "from" and "to" buffers in one big chunk */ + hc = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hc)) + return PTR_ERR(hc); - buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS); - if (!buf) + h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); + if (!hc2) return -ENOMEM; - for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) { - error = 
gfs2_dir_read_data(dip, (char *)buf, - block * sdp->sd_hash_bsize, - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto fail; - } - - from = buf; - to = (u64 *)((char *)buf + sdp->sd_hash_bsize); - - for (x = sdp->sd_hash_ptrs; x--; from++) { - *to++ = *from; /* No endianess worries */ - *to++ = *from; - } + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_kfree; - error = gfs2_dir_write_data(dip, - (char *)buf + sdp->sd_hash_bsize, - block * sdp->sd_sb.sb_bsize, - sdp->sd_sb.sb_bsize); - if (error != sdp->sd_sb.sb_bsize) { - if (error >= 0) - error = -EIO; - goto fail; - } + for (x = 0; x < hsize; x++) { + *h++ = *hc; + *h++ = *hc; + hc++; } - kfree(buf); - - error = gfs2_meta_inode_buffer(dip, &dibh); - if (!gfs2_assert_withdraw(sdp, !error)) { - dip->i_depth++; - gfs2_dinode_out(dip, dibh->b_data); - brelse(dibh); - } + error = gfs2_dir_write_data(dip, (char *)hc2, 0, hsize_bytes * 2); + if (error != (hsize_bytes * 2)) + goto fail; - return error; + gfs2_dir_hash_inval(dip); + dip->i_hash_cache = hc2; + dip->i_depth++; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + return 0; fail: - kfree(buf); + /* Replace original hash table & size */ + gfs2_dir_write_data(dip, (char *)hc, 0, hsize_bytes); + i_size_write(&dip->i_inode, hsize_bytes); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); +out_kfree: + kfree(hc2); return error; } @@ -1348,6 +1390,7 @@ out: return error; } + /** * dir_e_read - Reads the entries from a directory into a filldir buffer * @dip: dinode pointer @@ -1362,9 +1405,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir) { struct gfs2_inode *dip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); u32 hsize, len = 0; - u32 ht_offset, lp_offset, ht_offset_cur = -1; u32 hash, index; __be64 *lp; int copied = 0; @@ -1372,37 +1413,17 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; 
hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != i_size_read(inode)) { - gfs2_consist_inode(dip); - return -EIO; - } - hash = gfs2_dir_offset2hash(*offset); index = hash >> (32 - dip->i_depth); - lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); - if (!lp) - return -ENOMEM; + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); while (index < hsize) { - lp_offset = index & (sdp->sd_hash_ptrs - 1); - ht_offset = index - lp_offset; - - if (ht_offset_cur != ht_offset) { - error = gfs2_dir_read_data(dip, (char *)lp, - ht_offset * sizeof(__be64), - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto out; - } - ht_offset_cur = ht_offset; - } - error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, &copied, &depth, - be64_to_cpu(lp[lp_offset])); + be64_to_cpu(lp[index])); if (error) break; @@ -1410,8 +1431,6 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, index = (index & ~(len - 1)) + len; } -out: - kfree(lp); if (error > 0) error = 0; return error; @@ -1914,43 +1933,22 @@ out: int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct buffer_head *bh; struct gfs2_leaf *leaf; u32 hsize, len; - u32 ht_offset, lp_offset, ht_offset_cur = -1; u32 index = 0, next_index; __be64 *lp; u64 leaf_no; int error = 0, last; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { - gfs2_consist_inode(dip); - return -EIO; - } - lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); - if (!lp) - return -ENOMEM; + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); while (index < hsize) { - lp_offset = index & (sdp->sd_hash_ptrs - 1); - ht_offset = index - lp_offset; - - if (ht_offset_cur != ht_offset) { - error = gfs2_dir_read_data(dip, (char *)lp, - ht_offset * sizeof(__be64), - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto out; - } - 
ht_offset_cur = ht_offset; - } - - leaf_no = be64_to_cpu(lp[lp_offset]); + leaf_no = be64_to_cpu(lp[index]); if (leaf_no) { error = get_leaf(dip, leaf_no, &bh); if (error) @@ -1976,7 +1974,6 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) } out: - kfree(lp); return error; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index e686af11bec..ff5772fbf02 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -35,6 +35,7 @@ extern int gfs2_diradd_alloc_required(struct inode *dir, const struct qstr *filename); extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); +extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); static inline u32 gfs2_disk_hash(const char *data, int len) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a9f5cbe45cd..bc2590ef5fc 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -174,7 +174,9 @@ void gfs2_set_inode_flags(struct inode *inode) struct gfs2_inode *ip = GFS2_I(inode); unsigned int flags = inode->i_flags; - flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); + if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) + inode->i_flags |= S_NOSEC; if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) flags |= S_IMMUTABLE; if (ip->i_diskflags & GFS2_DIF_APPENDONLY) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 2792a790e50..88e8a23d002 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -409,6 +409,10 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) if (held1 && held2 && list_empty(&gl->gl_holders)) clear_bit(GLF_QUEUED, &gl->gl_flags); + if (new_state != gl->gl_target) + /* shorten our minimum hold time */ + gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, + GL_GLOCK_MIN_HOLD); gl->gl_state = new_state; gl->gl_tchange = jiffies; } @@ -663,20 +667,30 @@ static void glock_work_func(struct work_struct *work) drop_ref = 1; } spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, 
&gl->gl_flags) && + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { unsigned long holdtime, now = jiffies; - holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + + holdtime = gl->gl_tchange + gl->gl_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; - set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); + + if (!delay) { + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); + } } run_queue(gl, 0); spin_unlock(&gl->gl_spin); - if (!delay || - queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + if (!delay) gfs2_glock_put(gl); + else { + if (gl->gl_name.ln_type != LM_TYPE_INODE) + delay = 0; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); + } if (drop_ref) gfs2_glock_put(gl); } @@ -738,6 +752,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_sbd = sdp; + gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); INIT_WORK(&gl->gl_delete, delete_work_func); @@ -850,8 +865,15 @@ static int gfs2_glock_demote_wait(void *word) static void wait_on_holder(struct gfs2_holder *gh) { + unsigned long time1 = jiffies; + might_sleep(); wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); + if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ + /* Lengthen the minimum hold time. 
*/ + gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + + GL_GLOCK_HOLD_INCR, + GL_GLOCK_MAX_HOLD); } static void wait_on_demote(struct gfs2_glock *gl) @@ -1088,8 +1110,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh) gfs2_glock_hold(gl); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags)) - delay = gl->gl_ops->go_min_hold_time; + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } @@ -1268,12 +1291,13 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) unsigned long now = jiffies; gfs2_glock_hold(gl); - holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; - if (test_bit(GLF_QUEUED, &gl->gl_flags)) { + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) { if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) - delay = gl->gl_ops->go_min_hold_time; + delay = gl->gl_hold_time; } spin_lock(&gl->gl_spin); @@ -1662,7 +1686,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; - gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n", + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n", state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, @@ -1671,7 +1695,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_revokes), - atomic_read(&gl->gl_ref)); + atomic_read(&gl->gl_ref), gl->gl_hold_time); list_for_each_entry(gh, &gl->gl_holders, gh_list) { error = dump_holder(seq, gh); diff --git a/fs/gfs2/glock.h 
b/fs/gfs2/glock.h index 6b2f757b928..66707118af2 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -113,6 +113,12 @@ enum { #define GLR_TRYFAILED 13 +#define GL_GLOCK_MAX_HOLD (long)(HZ / 5) +#define GL_GLOCK_DFT_HOLD (long)(HZ / 5) +#define GL_GLOCK_MIN_HOLD (long)(10) +#define GL_GLOCK_HOLD_INCR (long)(HZ / 20) +#define GL_GLOCK_HOLD_DECR (long)(HZ / 40) + struct lm_lockops { const char *lm_proto_name; int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 8ef70f46473..da21ecaafcc 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -26,6 +26,7 @@ #include "rgrp.h" #include "util.h" #include "trans.h" +#include "dir.h" /** * __gfs2_ail_flush - remove all buffers for a given lock from the AIL @@ -47,10 +48,10 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl) bd_ail_gl_list); bh = bd->bd_bh; gfs2_remove_from_ail(bd); - spin_unlock(&sdp->sd_ail_lock); - bd->bd_bh = NULL; bh->b_private = NULL; + spin_unlock(&sdp->sd_ail_lock); + bd->bd_blkno = bh->b_blocknr; gfs2_log_lock(sdp); gfs2_assert_withdraw(sdp, !buffer_busy(bh)); @@ -218,11 +219,14 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) if (ip) { set_bit(GIF_INVALID, &ip->i_flags); forget_all_cached_acls(&ip->i_inode); + gfs2_dir_hash_inval(ip); } } - if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_sbd, NULL); gl->gl_sbd->sd_rindex_uptodate = 0; + } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); } @@ -314,6 +318,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_generation = be64_to_cpu(str->di_generation); ip->i_diskflags = be32_to_cpu(str->di_flags); + ip->i_eattr = be64_to_cpu(str->di_eattr); + /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ gfs2_set_inode_flags(&ip->i_inode); height = be16_to_cpu(str->di_height); if (unlikely(height > GFS2_MAX_META_HEIGHT)) @@ -326,7 +332,6 @@ 
static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_depth = (u8)depth; ip->i_entries = be32_to_cpu(str->di_entries); - ip->i_eattr = be64_to_cpu(str->di_eattr); if (S_ISREG(ip->i_inode.i_mode)) gfs2_set_aops(&ip->i_inode); @@ -547,7 +552,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_lock = inode_go_lock, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_min_hold_time = HZ / 5, .go_flags = GLOF_ASPACE, }; @@ -558,7 +562,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_min_hold_time = HZ / 5, .go_flags = GLOF_ASPACE, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0a064e91ac7..892ac37de8a 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -17,6 +17,7 @@ #include <linux/buffer_head.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> +#include <linux/completion.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -162,7 +163,6 @@ struct gfs2_glock_operations { int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); void (*go_callback) (struct gfs2_glock *gl); const int go_type; - const unsigned long go_min_hold_time; const unsigned long go_flags; #define GLOF_ASPACE 1 }; @@ -220,6 +220,7 @@ struct gfs2_glock { unsigned int gl_hash; unsigned long gl_demote_time; /* time of first demote request */ + long gl_hold_time; struct list_head gl_holders; const struct gfs2_glock_operations *gl_ops; @@ -284,6 +285,7 @@ struct gfs2_inode { u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; + __be64 *i_hash_cache; u32 i_entries; u32 i_diskflags; u8 i_height; @@ -546,6 +548,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_trans_gl; wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; + struct completion sd_locking_init; /* Inode Stuff */ diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 903115f2bb3..85c62923ee2 100644 --- 
a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -903,6 +903,7 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) if (gfs2_ail1_empty(sdp)) break; } + gfs2_log_flush(sdp, NULL); } static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c2b34cd2abe..29e1ace7953 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -41,6 +41,7 @@ static void gfs2_init_inode_once(void *foo) init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); ip->i_alloc = NULL; + ip->i_hash_cache = NULL; } static void gfs2_init_glock_once(void *foo) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8ac9ae189b5..516516e0c2a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -72,6 +72,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -1017,11 +1018,13 @@ hostdata_error: fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); + complete(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, fsname); if (ret == 0) fs_info(sdp, "Joined cluster. 
Now mounting FS...\n"); + complete(&sdp->sd_locking_init); return ret; } @@ -1091,6 +1094,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + sb->s_flags |= MS_NOSEC; sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9b780df3fd5..7f8af1eb02d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1607,14 +1607,15 @@ rgrp_error: } /** - * gfs2_free_data - free a contiguous run of data block(s) + * __gfs2_free_blocks - free a contiguous run of block(s) * @ip: the inode these blocks are being freed from * @bstart: first block of a run of contiguous blocks * @blen: the length of the block run + * @meta: 1 if the blocks represent metadata * */ -void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1631,54 +1632,11 @@ void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_trans_add_rg(rgd); /* Directories keep their data in the metadata address space */ - if (ip->i_depth) + if (meta || ip->i_depth) gfs2_meta_wipe(ip, bstart, blen); } /** - * gfs2_free_data - free a contiguous run of data block(s) - * @ip: the inode these blocks are being freed from - * @bstart: first block of a run of contiguous blocks - * @blen: the length of the block run - * - */ - -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - - __gfs2_free_data(ip, bstart, blen); - gfs2_statfs_change(sdp, 0, +blen, 0); - gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); -} - -/** - * gfs2_free_meta - free a contiguous run of data block(s) - * @ip: the inode these blocks are being freed from - * @bstart: first block of a run of contiguous blocks - * @blen: the 
length of the block run - * - */ - -void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrpd *rgd; - - rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); - if (!rgd) - return; - trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); - rgd->rd_free += blen; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - - gfs2_trans_add_rg(rgd); - gfs2_meta_wipe(ip, bstart, blen); -} - -/** * gfs2_free_meta - free a contiguous run of data block(s) * @ip: the inode these blocks are being freed from * @bstart: first block of a run of contiguous blocks @@ -1690,7 +1648,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - __gfs2_free_meta(ip, bstart, blen); + __gfs2_free_blocks(ip, bstart, blen, 1); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index a80e3034ac4..d253f9a8c70 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -52,9 +52,7 @@ extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); -extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ed540e7018b..b7beadd9ba4 100644 --- a/fs/gfs2/super.c +++ 
b/fs/gfs2/super.c @@ -757,13 +757,17 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct timespec atime; struct gfs2_dinode *di; int ret = -EAGAIN; + int unlock_required = 0; /* Skip timestamp update, if this is from a memalloc */ if (current->flags & PF_MEMALLOC) goto do_flush; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + unlock_required = 1; + } ret = gfs2_trans_begin(sdp, RES_DINODE, 0); if (ret) goto do_unlock; @@ -780,7 +784,8 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) } gfs2_trans_end(sdp); do_unlock: - gfs2_glock_dq_uninit(&gh); + if (unlock_required) + gfs2_glock_dq_uninit(&gh); do_flush: if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); @@ -1427,7 +1432,20 @@ out: return error; } -/* +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * * We have to (at the moment) hold the inodes main lock to cover * the gap between unlocking the shared lock on the iopen lock and * taking the exclusive lock. 
I'd rather do a shared -> exclusive @@ -1470,6 +1488,8 @@ static void gfs2_evict_inode(struct inode *inode) if (error) goto out_truncate; + /* Case 1 starts here */ + if (S_ISDIR(inode->i_mode) && (ip->i_diskflags & GFS2_DIF_EXHASH)) { error = gfs2_dir_exhash_dealloc(ip); @@ -1493,13 +1513,16 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: + /* Case 2 starts here */ error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) goto out_unlock; - gfs2_final_release_pages(ip); + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); gfs2_trans_end(sdp); out_unlock: + /* Error path for case 1 */ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); @@ -1507,9 +1530,10 @@ out_unlock: if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: + /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); - + gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index e20eab37bc8..443cabcfcd2 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -338,6 +338,9 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = sscanf(buf, "%u", &first); if (rv != 1 || first > 1) return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; spin_lock(&sdp->sd_jindex_spin); rv = -EBUSY; if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) @@ -414,7 +417,9 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = sscanf(buf, "%d", &jid); if (rv != 1) return -EINVAL; - + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; spin_lock(&sdp->sd_jindex_spin); rv = -EINVAL; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) diff --git 
a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 2312de34bd4..2a734cfccc9 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -43,6 +43,10 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) node->tree->node_size - (rec + 1) * 2); if (!recoff) return 0; + if (recoff > node->tree->node_size - 2) { + printk(KERN_ERR "hfs: recoff %d too large\n", recoff); + return 0; + } retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index b4ba1b31933..4dfbfec357e 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -212,7 +212,9 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfsplus_fill_cat_thread(sb, &entry, @@ -269,7 +271,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? 
str->name : NULL, cnid); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; if (!str) { int len; @@ -347,12 +351,14 @@ int hfsplus_rename_cat(u32 cnid, struct hfs_find_data src_fd, dst_fd; hfsplus_cat_entry entry; int entry_size, type; - int err = 0; + int err; dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); + if (err) + return err; dst_fd = src_fd; /* find the old dir entry and read the data */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4df5059c25d..25b2443a004 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -38,7 +38,9 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; dentry->d_fsdata = NULL; - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return ERR_PTR(err); hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); @@ -132,7 +134,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos >= inode->i_size) return 0; - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index b1991a2a08e..5849e3ef35c 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -119,22 +119,31 @@ static void __hfsplus_ext_write_extent(struct inode *inode, set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); } -static void hfsplus_ext_write_extent_locked(struct inode *inode) +static int hfsplus_ext_write_extent_locked(struct inode *inode) { + int res; + if 
(HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { struct hfs_find_data fd; - hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + if (res) + return res; __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } + return 0; } -void hfsplus_ext_write_extent(struct inode *inode) +int hfsplus_ext_write_extent(struct inode *inode) { + int res; + mutex_lock(&HFSPLUS_I(inode)->extents_lock); - hfsplus_ext_write_extent_locked(inode); + res = hfsplus_ext_write_extent_locked(inode); mutex_unlock(&HFSPLUS_I(inode)->extents_lock); + + return res; } static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, @@ -194,9 +203,11 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block) block < hip->cached_start + hip->cached_blocks) return 0; - hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); - res = __hfsplus_ext_cache_extent(&fd, inode, block); - hfs_find_exit(&fd); + res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); + if (!res) { + res = __hfsplus_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + } return res; } @@ -209,6 +220,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, struct hfsplus_inode_info *hip = HFSPLUS_I(inode); int res = -EIO; u32 ablock, dblock, mask; + sector_t sector; int was_dirty = 0; int shift; @@ -255,10 +267,12 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, done: dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); + mask = (1 << sbi->fs_shift) - 1; - map_bh(bh_result, sb, - (dblock << sbi->fs_shift) + sbi->blockoffset + - (iblock & mask)); + sector = ((sector_t)dblock << sbi->fs_shift) + + sbi->blockoffset + (iblock & mask); + map_bh(bh_result, sb, sector); + if (create) { set_buffer_new(bh_result); hip->phys_size += sb->s_blocksize; @@ -371,7 +385,9 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, if (total_blocks == blocks) return 0; - 
hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + if (res) + return res; do { res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, total_blocks, type); @@ -469,7 +485,9 @@ out: insert_extent: dprint(DBG_EXTENT, "insert new extent\n"); - hfsplus_ext_write_extent_locked(inode); + res = hfsplus_ext_write_extent_locked(inode); + if (res) + goto out; memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->cached_extents[0].start_block = cpu_to_be32(start); @@ -500,7 +518,6 @@ void hfsplus_file_truncate(struct inode *inode) struct page *page; void *fsdata; u32 size = inode->i_size; - int res; res = pagecache_write_begin(NULL, mapping, size, 0, AOP_FLAG_UNINTERRUPTIBLE, @@ -523,7 +540,12 @@ void hfsplus_file_truncate(struct inode *inode) goto out; mutex_lock(&hip->extents_lock); - hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); + if (res) { + mutex_unlock(&hip->extents_lock); + /* XXX: We lack error handling of hfsplus_file_truncate() */ + return; + } while (1) { if (alloc_cnt == hip->first_blocks) { hfsplus_free_extents(sb, hip->first_extents, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d6857523336..81dfd1e495e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -13,6 +13,7 @@ #include <linux/fs.h> #include <linux/mutex.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> #include "hfsplus_raw.h" #define DBG_BNODE_REFS 0x00000001 @@ -110,7 +111,9 @@ struct hfsplus_vh; struct hfs_btree; struct hfsplus_sb_info { + void *s_vhdr_buf; struct hfsplus_vh *s_vhdr; + void *s_backup_vhdr_buf; struct hfsplus_vh *s_backup_vhdr; struct hfs_btree *ext_tree; struct hfs_btree *cat_tree; @@ -258,6 +261,15 @@ struct hfsplus_readdir_data { struct hfsplus_cat_key key; }; +/* + * Find minimum acceptible I/O size for an hfsplus sb. 
+ */ +static inline unsigned short hfsplus_min_io_size(struct super_block *sb) +{ + return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + HFSPLUS_SECTOR_SIZE); +} + #define hfs_btree_open hfsplus_btree_open #define hfs_btree_close hfsplus_btree_close #define hfs_btree_write hfsplus_btree_write @@ -374,7 +386,7 @@ extern const struct file_operations hfsplus_dir_operations; /* extents.c */ int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); -void hfsplus_ext_write_extent(struct inode *); +int hfsplus_ext_write_extent(struct inode *); int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int); @@ -436,8 +448,8 @@ int hfsplus_compare_dentry(const struct dentry *parent, /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); int hfs_part_find(struct super_block *, sector_t *, sector_t *); -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw); /* time macros */ #define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b248a6cfcad..010cd363d08 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -195,11 +195,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, hip->flags = 0; set_bit(HFSPLUS_I_RSRC, &hip->flags); - hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); - err = hfsplus_find_cat(sb, dir->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); + err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); + if (!err) { + err = hfsplus_find_cat(sb, dir->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } if (err) { iput(inode); return ERR_PTR(err); diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 40ad88c12c6..eb355d81e27 100644 --- 
a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -88,11 +88,12 @@ static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, return -ENOENT; } -static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, - sector_t *part_start, sector_t *part_size) +static int hfs_parse_new_pmap(struct super_block *sb, void *buf, + struct new_pmap *pm, sector_t *part_start, sector_t *part_size) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); int size = be32_to_cpu(pm->pmMapBlkCnt); + int buf_size = hfsplus_min_io_size(sb); int res; int i = 0; @@ -107,11 +108,14 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, if (++i >= size) return -ENOENT; - res = hfsplus_submit_bio(sb->s_bdev, - *part_start + HFS_PMAP_BLK + i, - pm, READ); - if (res) - return res; + pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); + if ((u8 *)pm - (u8 *)buf >= buf_size) { + res = hfsplus_submit_bio(sb, + *part_start + HFS_PMAP_BLK + i, + buf, (void **)&pm, READ); + if (res) + return res; + } } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); return -ENOENT; @@ -124,15 +128,15 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { - void *data; + void *buf, *data; int res; - data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!data) + buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!buf) return -ENOMEM; - res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, - data, READ); + res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, + buf, &data, READ); if (res) goto out; @@ -141,13 +145,13 @@ int hfs_part_find(struct super_block *sb, res = hfs_parse_old_pmap(sb, data, part_start, part_size); break; case HFS_NEW_PMAP_MAGIC: - res = hfs_parse_new_pmap(sb, data, part_start, part_size); + res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); break; default: res = -ENOENT; break; } out: - 
kfree(data); + kfree(buf); return res; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b49b55584c8..c106ca22e81 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -73,11 +73,13 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) { - hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); - err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); + err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); + if (!err) { + err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); + if (!err) + err = hfsplus_cat_read_inode(inode, &fd); + hfs_find_exit(&fd); + } } else { err = hfsplus_system_read_inode(inode); } @@ -133,9 +135,13 @@ static int hfsplus_system_write_inode(struct inode *inode) static int hfsplus_write_inode(struct inode *inode, struct writeback_control *wbc) { + int err; + dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); - hfsplus_ext_write_extent(inode); + err = hfsplus_ext_write_extent(inode); + if (err) + return err; if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || inode->i_ino == HFSPLUS_ROOT_CNID) @@ -197,17 +203,17 @@ int hfsplus_sync_fs(struct super_block *sb, int wait) write_backup = 1; } - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, WRITE_SYNC); + sbi->s_vhdr_buf, NULL, WRITE_SYNC); if (!error) error = error2; if (!write_backup) goto out; - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr, WRITE_SYNC); + sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC); if (!error) error2 = error; out: @@ -251,8 +257,8 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); iput(sbi->alloc_file); iput(sbi->hidden_dir); - kfree(sbi->s_vhdr); - 
kfree(sbi->s_backup_vhdr); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); unload_nls(sbi->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; @@ -393,6 +399,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->rsrc_clump_blocks) sbi->rsrc_clump_blocks = 1; + err = generic_check_addressable(sbi->alloc_blksz_shift, + sbi->total_blocks); + if (err) { + printk(KERN_ERR "hfs: filesystem size too large.\n"); + goto out_free_vhdr; + } + /* Set up operations so we can load metadata */ sb->s_op = &hfsplus_sops; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -417,6 +430,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY; } + err = -EINVAL; + /* Load metadata objects (B*Trees) */ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { @@ -447,7 +462,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; str.name = HFSP_HIDDENDIR_NAME; - hfs_find_init(sbi->cat_tree, &fd); + err = hfs_find_init(sbi->cat_tree, &fd); + if (err) + goto out_put_root; hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); @@ -500,7 +517,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) out_put_hidden_dir: iput(sbi->hidden_dir); out_put_root: - iput(sbi->alloc_file); + iput(root); out_put_alloc_file: iput(sbi->alloc_file); out_close_cat_tree: diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index a3f0bfcc881..a32998f29f0 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -142,7 +142,11 @@ int hfsplus_uni2asc(struct super_block *sb, /* search for single decomposed char */ if (likely(compose)) ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0); - if (ce1 && (cc = ce1[0])) { + if (ce1) + cc = ce1[0]; + else + cc = 0; + if (cc) { /* start of a possibly decomposed Hangul char */ 
if (cc != 0xffff) goto done; @@ -209,7 +213,8 @@ int hfsplus_uni2asc(struct super_block *sb, i++; ce2 = ce1; } - if ((cc = ce2[0])) { + cc = ce2[0]; + if (cc) { ip += i; ustrlen -= i; goto done; @@ -301,7 +306,11 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { size = asc2unichar(sb, astr, len, &c); - if (decompose && (dstr = decompose_unichar(c, &dsize))) { + if (decompose) + dstr = decompose_unichar(c, &dsize); + else + dstr = NULL; + if (dstr) { if (outlen + dsize > HFSPLUS_MAX_STRLEN) break; do { @@ -346,15 +355,23 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, astr += size; len -= size; - if (decompose && (dstr = decompose_unichar(c, &dsize))) { + if (decompose) + dstr = decompose_unichar(c, &dsize); + else + dstr = NULL; + if (dstr) { do { c2 = *dstr++; - if (!casefold || (c2 = case_fold(c2))) + if (casefold) + c2 = case_fold(c2); + if (!casefold || c2) hash = partial_name_hash(c2, hash); } while (--dsize > 0); } else { c2 = c; - if (!casefold || (c2 = case_fold(c2))) + if (casefold) + c2 = case_fold(c2); + if (!casefold || c2) hash = partial_name_hash(c2, hash); } } @@ -422,12 +439,14 @@ int hfsplus_compare_dentry(const struct dentry *parent, c1 = *dstr1; c2 = *dstr2; if (casefold) { - if (!(c1 = case_fold(c1))) { + c1 = case_fold(c1); + if (!c1) { dstr1++; dsize1--; continue; } - if (!(c2 = case_fold(c2))) { + c2 = case_fold(c2); + if (!c2) { dstr2++; dsize2--; continue; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 3031d81f5f0..10e515a0d45 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -31,31 +31,77 @@ static void hfsplus_end_io_sync(struct bio *bio, int err) complete(bio->bi_private); } -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw) +/* + * hfsplus_submit_bio - Perform block I/O + * @sb: super block of volume for I/O + * @sector: block to read or write, for blocks of
HFSPLUS_SECTOR_SIZE bytes + * @buf: buffer for I/O + * @data: output pointer for location of requested data + * @rw: direction of I/O + * + * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than + * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads + * @data will return a pointer to the start of the requested sector, + * which may not be the same location as @buf. + * + * If @sector is not aligned to the bdev logical block size it will + * be rounded down. For writes this means that @buf should contain data + * that starts at the rounded-down address. As long as the data was + * read using hfsplus_submit_bio() and the same buffer is used things + * will work correctly. + */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw) { DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; + int ret = 0; + unsigned int io_size; + loff_t start; + int offset; + + /* + * Align sector to hardware sector size and find offset. We + * assume that io_size is a power of two, which _should_ + * be true. + */ + io_size = hfsplus_min_io_size(sb); + start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + offset = start & (io_size - 1); + sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; - bio->bi_bdev = bdev; + bio->bi_bdev = sb->s_bdev; bio->bi_end_io = hfsplus_end_io_sync; bio->bi_private = &wait; - /* - * We always submit one sector at a time, so bio_add_page must not fail. 
- */ - if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE, - offset_in_page(data)) != HFSPLUS_SECTOR_SIZE) - BUG(); + if (!(rw & WRITE) && data) + *data = (u8 *)buf + offset; + + while (io_size > 0) { + unsigned int page_offset = offset_in_page(buf); + unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, + io_size); + + ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); + if (ret != len) { + ret = -EIO; + goto out; + } + io_size -= len; + buf = (u8 *)buf + len; + } submit_bio(rw, bio); wait_for_completion(&wait); if (!bio_flagged(bio, BIO_UPTODATE)) - return -EIO; - return 0; + ret = -EIO; + +out: + bio_put(bio); + return ret < 0 ? ret : 0; } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) @@ -138,23 +184,19 @@ int hfsplus_read_wrapper(struct super_block *sb) if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; - if ((u64)part_start + part_size > 0x100000000ULL) { - pr_err("hfs: volumes larger than 2TB are not supported yet\n"); - goto out; - } error = -ENOMEM; - sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_vhdr) + sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_vhdr_buf) goto out; - sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_backup_vhdr) + sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_backup_vhdr_buf) goto out_free_vhdr; reread: - error = hfsplus_submit_bio(sb->s_bdev, - part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, + READ); if (error) goto out_free_backup_vhdr; @@ -169,8 +211,9 @@ reread: if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) goto out_free_backup_vhdr; wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; - part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; - part_size = wd.embed_count * wd.ablk_size; + part_start += (sector_t)wd.ablk_start + + 
(sector_t)wd.embed_start * wd.ablk_size; + part_size = (sector_t)wd.embed_count * wd.ablk_size; goto reread; default: /* @@ -183,9 +226,9 @@ reread: goto reread; } - error = hfsplus_submit_bio(sb->s_bdev, - part_start + part_size - 2, - sbi->s_backup_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + part_size - 2, + sbi->s_backup_vhdr_buf, + (void **)&sbi->s_backup_vhdr, READ); if (error) goto out_free_backup_vhdr; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 87ed48e0343..85c098a499f 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -139,7 +139,8 @@ static int file_removed(struct dentry *dentry, const char *file) static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, struct nameidata *nd) { - struct dentry *proc_dentry, *new, *parent; + struct dentry *proc_dentry, *parent; + struct qstr *name = &dentry->d_name; struct inode *inode; int err, deleted; @@ -149,23 +150,9 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, else if (deleted) return ERR_PTR(-ENOENT); - err = -ENOMEM; parent = HPPFS_I(ino)->proc_dentry; mutex_lock(&parent->d_inode->i_mutex); - proc_dentry = d_lookup(parent, &dentry->d_name); - if (proc_dentry == NULL) { - proc_dentry = d_alloc(parent, &dentry->d_name); - if (proc_dentry == NULL) { - mutex_unlock(&parent->d_inode->i_mutex); - goto out; - } - new = (*parent->d_inode->i_op->lookup)(parent->d_inode, - proc_dentry, NULL); - if (new) { - dput(proc_dentry); - proc_dentry = new; - } - } + proc_dentry = lookup_one_len(name->name, parent, name->len); mutex_unlock(&parent->d_inode->i_mutex); if (IS_ERR(proc_dentry)) @@ -174,13 +161,11 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, err = -ENOMEM; inode = get_inode(ino->i_sb, proc_dentry); if (!inode) - goto out_dput; + goto out; d_add(dentry, inode); return NULL; - out_dput: - dput(proc_dentry); out: return ERR_PTR(err); } @@ -690,8 +675,10 @@ static struct inode *get_inode(struct super_block 
*sb, struct dentry *dentry) struct inode *proc_ino = dentry->d_inode; struct inode *inode = new_inode(sb); - if (!inode) + if (!inode) { + dput(dentry); return ERR_PTR(-ENOMEM); + } if (S_ISDIR(dentry->d_inode->i_mode)) { inode->i_op = &hppfs_dir_iops; @@ -704,7 +691,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_fop = &hppfs_file_fops; } - HPPFS_I(inode)->proc_dentry = dget(dentry); + HPPFS_I(inode)->proc_dentry = dentry; inode->i_uid = proc_ino->i_uid; inode->i_gid = proc_ino->i_gid; @@ -737,7 +724,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) sb->s_fs_info = proc_mnt; err = -ENOMEM; - root_inode = get_inode(sb, proc_mnt->mnt_sb->s_root); + root_inode = get_inode(sb, dget(proc_mnt->mnt_sb->s_root)); if (!root_inode) goto out_mntput; diff --git a/fs/inode.c b/fs/inode.c index 0f7e88a7803..43566d17d1b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -423,7 +423,14 @@ EXPORT_SYMBOL(remove_inode_hash); void end_writeback(struct inode *inode) { might_sleep(); + /* + * We have to cycle tree_lock here because reclaim can be still in the + * process of removing the last page (in __delete_from_page_cache()) + * and we must not free mapping under it. 
+ */ + spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3db5ba4568f..b3cc8586984 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -974,7 +974,7 @@ out_no_inode: out_no_read: printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); - goto out_freesbi; + goto out_freebh; out_bad_zone_size: printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); @@ -989,6 +989,7 @@ out_unknown_format: out_freebh: brelse(bh); + brelse(pri_bh); out_freesbi: kfree(opt.iocharset); kfree(sbi); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6a79fd0a1a3..2c62c5aae82 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -97,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -223,8 +227,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -243,7 +247,6 @@ restart: */ released = __jbd2_journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); __brelse(bh); } @@ -284,7 +287,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, int ret = 0; if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + 
get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -316,12 +319,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -554,7 +557,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and + * release them. * * Called with the journal locked. * Called with j_list_lock held. @@ -663,8 +667,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -684,13 +688,14 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + jbd2_journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -701,10 +706,8 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of jbd2_journal_commit_transaction(). 
*/ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -723,7 +726,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -742,6 +744,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + jbd2_journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7f21cf3aaf9..eef6979821a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -848,10 +848,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); @@ -914,28 +920,27 @@ restart_loop: __jbd2_journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. 
*BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __jbd2_journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - jbd2_journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9a782699030..0dfa5b598e6 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2078,10 +2078,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -2094,17 +2093,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = jbd2_journal_add_journal_head(bh); * ... 
+ * (Get another reference for transaction) + * jbd2_journal_grab_journal_head(bh); * jh->b_transaction = xxx; + * (Put original reference) * jbd2_journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) @@ -2168,61 +2166,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd2_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd2_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + 
jbd2_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void jbd2_journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. 
*/ void jbd2_journal_put_journal_head(struct journal_head *jh) @@ -2232,11 +2198,12 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3eec82d32fd..2d7109414cd 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -30,6 +30,7 @@ #include <linux/module.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); +static void __jbd2_journal_unfile_buffer(struct journal_head *jh); /* * jbd2_get_transaction: obtain a new transaction_t object. @@ -764,7 +765,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); @@ -814,7 +814,6 @@ out: * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes - * @credits: variable that will receive credits for the buffer * * Returns an error code or 0 on success. * @@ -896,8 +895,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. 
*/ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; - /* first access by this transaction */ jh->b_modified = 0; @@ -932,7 +929,6 @@ out: * non-rewindable consequences * @handle: transaction * @bh: buffer to undo - * @credits: store the number of taken credits here (if not NULL) * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses @@ -1232,8 +1228,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1556,19 +1550,32 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } -void __jbd2_journal_unfile_buffer(struct journal_head *jh) +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. 
+ */ +static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + jbd2_journal_put_journal_head(jh); } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1595,8 +1602,6 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1659,7 +1664,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * jbd2_journal_remove_journal_head() and * jbd2_journal_put_journal_head(). 
*/ jh = jbd2_journal_grab_journal_head(bh); @@ -1697,10 +1701,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __jbd2_journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __jbd2_journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1711,8 +1714,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); + __jbd2_journal_unfile_buffer(jh); } return may_free; } @@ -1990,6 +1992,8 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __jbd2_journal_temp_unlink_buffer(jh); + else + jbd2_journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2041,9 +2045,10 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __jbd2_journal_refile_buffer(struct journal_head *jh) { @@ -2067,6 +2072,11 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __jbd2_journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __jbd2_journal_file_buffer() must not + * take a new one. 
+ */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2083,30 +2093,21 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __jbd2_journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __jbd2_journal_refile_buffer() with necessary locking added. We take our + * bh reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. 
*/ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index c5ce6c1d1ff..2f3f531f360 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -66,9 +66,9 @@ static int jfs_open(struct inode *inode, struct file *file) struct jfs_inode_info *ji = JFS_IP(inode); spin_lock_irq(&ji->ag_lock); if (ji->active_ag == -1) { - ji->active_ag = ji->agno; - atomic_inc( - &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]); + struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); + ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); + atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index ed53a474016..b78b2f978f0 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -397,7 +397,7 @@ int diRead(struct inode *ip) release_metapage(mp); /* set the ag for the inode */ - JFS_IP(ip)->agno = BLKTOAG(agstart, sbi); + JFS_IP(ip)->agstart = agstart; JFS_IP(ip)->active_ag = -1; return (rc); @@ -901,7 +901,7 @@ int diFree(struct inode *ip) /* get the allocation group for this ino. 
*/ - agno = JFS_IP(ip)->agno; + agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb)); /* Lock the AG specific inode map information */ @@ -1315,12 +1315,11 @@ int diFree(struct inode *ip) static inline void diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) { - struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); ip->i_ino = (iagno << L2INOSPERIAG) + ino; jfs_ip->ixpxd = iagp->inoext[extno]; - jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + jfs_ip->agstart = le64_to_cpu(iagp->agstart); jfs_ip->active_ag = -1; } @@ -1379,7 +1378,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) */ /* get the ag number of this iag */ - agno = JFS_IP(pip)->agno; + agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { /* @@ -2921,10 +2920,9 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) continue; } - /* agstart that computes to the same ag is treated as same; */ agstart = le64_to_cpu(iagp->agstart); - /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */ n = agstart >> mp->db_agl2size; + iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); /* compute backed inodes */ numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 1439f119ec8..584a4a1a6e8 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -50,8 +50,9 @@ struct jfs_inode_info { short btindex; /* btpage entry index*/ struct inode *ipimap; /* inode map */ unsigned long cflag; /* commit flags */ + u64 agstart; /* agstart of the containing IAG */ u16 bxflag; /* xflag of pseudo buffer? */ - unchar agno; /* ag number */ + unchar pad; signed char active_ag; /* ag currently allocating from */ lid_t blid; /* lid of pseudo buffer? 
*/ lid_t atlhead; /* anonymous tlock list head */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 278e3fb40b7..583636f745e 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1123,7 +1123,7 @@ int lmLogOpen(struct super_block *sb) bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, log); if (IS_ERR(bdev)) { - rc = -PTR_ERR(bdev); + rc = PTR_ERR(bdev); goto free; } diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 8ea5efb5a34..8d0c1c7c082 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -80,7 +80,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) int log_formatted = 0; struct inode *iplist[1]; struct jfs_superblock *j_sb, *j_sb2; - uint old_agsize; + s64 old_agsize; int agsizechanged = 0; struct buffer_head *bh, *bh2; diff --git a/fs/libfs.c b/fs/libfs.c index c88eab55aec..275ca4749a2 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -822,7 +822,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - val = simple_strtol(attr->set_buf, NULL, 0); + val = simple_strtoll(attr->set_buf, NULL, 0); ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index adb45ec9038..e374050a911 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -708,7 +708,13 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) if (task->tk_status < 0) { dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status); - goto retry_rebind; + switch (task->tk_status) { + case -EACCES: + case -EIO: + goto die; + default: + goto retry_rebind; + } } if (status == NLM_LCK_DENIED_GRACE_PERIOD) { rpc_delay(task, NLMCLNT_GRACE_WAIT); diff --git a/fs/locks.c b/fs/locks.c index 0a4f50dfadf..b286539d547 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -160,10 +160,28 @@ EXPORT_SYMBOL_GPL(unlock_flocks); static struct kmem_cache *filelock_cache 
__read_mostly; +static void locks_init_lock_always(struct file_lock *fl) +{ + fl->fl_next = NULL; + fl->fl_fasync = NULL; + fl->fl_owner = NULL; + fl->fl_pid = 0; + fl->fl_nspid = NULL; + fl->fl_file = NULL; + fl->fl_flags = 0; + fl->fl_type = 0; + fl->fl_start = fl->fl_end = 0; +} + /* Allocate an empty lock structure. */ struct file_lock *locks_alloc_lock(void) { - return kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_always(fl); + + return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lock); @@ -200,17 +218,9 @@ void locks_init_lock(struct file_lock *fl) INIT_LIST_HEAD(&fl->fl_link); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); - fl->fl_next = NULL; - fl->fl_fasync = NULL; - fl->fl_owner = NULL; - fl->fl_pid = 0; - fl->fl_nspid = NULL; - fl->fl_file = NULL; - fl->fl_flags = 0; - fl->fl_type = 0; - fl->fl_start = fl->fl_end = 0; fl->fl_ops = NULL; fl->fl_lmops = NULL; + locks_init_lock_always(fl); } EXPORT_SYMBOL(locks_init_lock); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 9ed89d1663f..1afae26cf23 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,13 +555,6 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask, unsigned int flags) -{ - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return generic_permission(inode, mask, flags, NULL); -} - static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -820,7 +813,6 @@ const struct inode_operations logfs_dir_iops = { .mknod = logfs_mknod, .rename = logfs_rename, .rmdir = logfs_rmdir, - .permission = logfs_permission, .symlink = logfs_symlink, .unlink = logfs_unlink, }; diff --git a/fs/namei.c b/fs/namei.c index e2e4e8d032e..14ab8d3f2f0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -238,7 +238,8 @@ int generic_permission(struct 
inode *inode, int mask, unsigned int flags, /* * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * Executable DACs are overridable for all directories and + * for non-directories that have least one exec bit set. */ if (!(mask & MAY_EXEC) || execute_ok(inode)) if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) @@ -432,6 +433,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) goto err_parent; BUG_ON(nd->inode != parent->d_inode); } else { + if (dentry->d_parent != parent) + goto err_parent; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (!__d_rcu_to_refcount(dentry, nd->seq)) goto err_child; @@ -812,6 +815,11 @@ static int follow_automount(struct path *path, unsigned flags, if (!mnt) /* mount collision */ return 0; + if (!*need_mntput) { + /* lock_mount() may release path->mnt on error */ + mntget(path->mnt); + *need_mntput = true; + } err = finish_automount(mnt, path); switch (err) { @@ -819,12 +827,9 @@ static int follow_automount(struct path *path, unsigned flags, /* Someone else made a mount here whilst we were busy */ return 0; case 0: - dput(path->dentry); - if (*need_mntput) - mntput(path->mnt); + path_put(path); path->mnt = mnt; path->dentry = dget(mnt->mnt_root); - *need_mntput = true; return 0; default: return err; @@ -844,9 +849,10 @@ static int follow_automount(struct path *path, unsigned flags, */ static int follow_managed(struct path *path, unsigned flags) { + struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; bool need_mntput = false; - int ret; + int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see @@ -861,7 +867,7 @@ static int follow_managed(struct path *path, unsigned flags) BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path->dentry, false); if (ret < 0) - return ret == -EISDIR ? 
0 : ret; + break; } /* Transit to a mounted filesystem. */ @@ -887,14 +893,19 @@ static int follow_managed(struct path *path, unsigned flags) if (managed & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, flags, &need_mntput); if (ret < 0) - return ret == -EISDIR ? 0 : ret; + break; continue; } /* We didn't change the current path point */ break; } - return 0; + + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (ret == -EISDIR) + ret = 0; + return ret; } int follow_down_one(struct path *path) @@ -931,7 +942,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - *inode = path->dentry->d_inode; if (unlikely(managed_dentry_might_block(path->dentry))) return false; @@ -944,6 +954,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = mounted; path->dentry = mounted->mnt_root; nd->seq = read_seqcount_begin(&path->dentry->d_seq); + /* + * Update the inode too. We don't need to re-check the + * dentry sequence number here after this d_inode read, + * because a mount-point is always pinned. + */ + *inode = path->dentry->d_inode; } return true; } @@ -1003,9 +1019,6 @@ failed: * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. - * - * Care must be taken as namespace_sem may be held (indicated by mounting_here - * being true). 
*/ int follow_down(struct path *path) { @@ -2624,6 +2637,10 @@ static long do_rmdir(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; + if (!dentry->d_inode) { + error = -ENOENT; + goto exit3; + } error = mnt_want_write(nd.path.mnt); if (error) goto exit3; @@ -2712,8 +2729,9 @@ static long do_unlinkat(int dfd, const char __user *pathname) if (nd.last.name[nd.last.len]) goto slashes; inode = dentry->d_inode; - if (inode) - ihold(inode); + if (!inode) + goto slashes; + ihold(inode); error = mnt_want_write(nd.path.mnt); if (error) goto exit2; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ce153a6b3ae..419119c371b 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -259,12 +259,10 @@ static void nfs_fscache_disable_inode_cookie(struct inode *inode) dfprintk(FSCACHE, "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); - /* Need to invalidate any mapped pages that were read in before - * turning off the cache. + /* Need to uncache any pages attached to this inode that + * fscache knows about before turning off the cache. 
*/ - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2(inode->i_mapping); - + fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); nfs_fscache_zap_inode_cookie(inode); } } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 144f2a3c718..6f4850deb27 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -256,7 +256,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfs_attr_check_mountpoint(sb, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) + if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && + !nfs_attr_use_mounted_on_fileid(fattr)) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; @@ -1294,7 +1295,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { + if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || + new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b9056cbe68d..2a55347a2da 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -45,6 +45,17 @@ static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; } +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + + fattr->fileid = fattr->mounted_on_fileid; + return 1; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 
426908809c9..f9d03abcd04 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -30,6 +30,7 @@ */ #include <linux/nfs_fs.h> +#include <linux/nfs_page.h> #include "internal.h" #include "nfs4filelayout.h" @@ -397,7 +398,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) * this offset and save the original offset. */ data->args.offset = filelayout_get_dserver_offset(lseg, offset); - data->mds_offset = offset; /* Perform an asynchronous write */ status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, @@ -552,13 +552,18 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __func__, nfl_util, fl->num_fh, fl->first_stripe_index, fl->pattern_offset); - if (!fl->num_fh) + /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. + * Futher checking is done in filelayout_check_layout */ + if (fl->num_fh < 0 || fl->num_fh > + max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; - fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), - gfp_flags); - if (!fl->fh_array) - goto out_err; + if (fl->num_fh > 0) { + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + gfp_flags); + if (!fl->fh_array) + goto out_err; + } for (i = 0; i < fl->num_fh; i++) { /* Do we want to use a mempool here? 
*/ @@ -661,8 +666,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, u64 p_stripe, r_stripe; u32 stripe_unit; - if (!pnfs_generic_pg_test(pgio, prev, req)) - return 0; + if (!pnfs_generic_pg_test(pgio, prev, req) || + !nfs_generic_pg_test(pgio, prev, req)) + return false; if (!pgio->pg_lseg) return 1; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2c4b59c896..5879b23e0c9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2265,12 +2265,14 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, return nfs4_map_errors(status); } +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); /* * Get locations and (maybe) other attributes of a referral. * Note that we'll actually follow the referral later when * we detect fsid mismatch in inode revalidation */ -static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle) +static int nfs4_get_referral(struct inode *dir, const struct qstr *name, + struct nfs_fattr *fattr, struct nfs_fh *fhandle) { int status = -ENOMEM; struct page *page = NULL; @@ -2288,15 +2290,16 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct goto out; /* Make sure server returned a different fsid for the referral */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { - dprintk("%s: server did not return a different fsid for a referral at %s\n", __func__, name->name); + dprintk("%s: server did not return a different fsid for" + " a referral at %s\n", __func__, name->name); status = -EIO; goto out; } + /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ + nfs_fixup_referral_attributes(&locations->fattr); + /* replace the lookup nfs_fattr with the locations nfs_fattr */ memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); - fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL; - if (!fattr->mode) - fattr->mode = S_IFDIR; memset(fhandle, 0, 
sizeof(struct nfs_fh)); out: if (page) @@ -4667,11 +4670,15 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, return len; } +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */ static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) { - if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && - (fattr->valid & NFS_ATTR_FATTR_FSID) && - (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || + (fattr->valid & NFS_ATTR_FATTR_FILEID)) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) return; fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | @@ -4686,7 +4693,6 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs_server *server = NFS_SERVER(dir); u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - [1] = FATTR4_WORD1_MOUNTED_ON_FILEID, }; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), @@ -4705,11 +4711,18 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, int status; dprintk("%s: start\n", __func__); + + /* Ask for the fileid of the absent filesystem if mounted_on_fileid + * is not supported */ + if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + else + bitmask[0] |= FATTR4_WORD0_FILEID; + nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); - nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } @@ -5098,7 +5111,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) if (mxresp_sz == 0) mxresp_sz = NFS_MAX_FILE_IO_SIZE; /* Fore channel attributes */ - args->fc_attrs.headerpadsz = 0; args->fc_attrs.max_rqst_sz = mxrqst_sz; 
args->fc_attrs.max_resp_sz = mxresp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; @@ -5111,7 +5123,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.headerpadsz = 0; args->bc_attrs.max_rqst_sz = PAGE_SIZE; args->bc_attrs.max_resp_sz = PAGE_SIZE; args->bc_attrs.max_resp_sz_cached = 0; @@ -5131,8 +5142,6 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args struct nfs4_channel_attrs *sent = &args->fc_attrs; struct nfs4_channel_attrs *rcvd = &session->fc_attrs; - if (rcvd->headerpadsz > sent->headerpadsz) - return -EINVAL; if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; /* @@ -5697,6 +5706,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; dprintk("--> %s\n", __func__); @@ -5708,16 +5718,15 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) nfs_restart_rpc(task, lrp->clp); return; } + spin_lock(&lo->plh_inode->i_lock); if (task->tk_status == 0) { - struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; - if (lrp->res.lrs_present) { - spin_lock(&lo->plh_inode->i_lock); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - spin_unlock(&lo->plh_inode->i_lock); } else BUG_ON(!list_empty(&lo->plh_segs)); } + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d869a5e5464..e6e8f3b9a1d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -91,7 +91,7 @@ static int nfs4_stat_to_errno(int); #define encode_getfh_maxsz (op_encode_hdr_maxsz) #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ ((3+NFS4_FHSIZE) >> 2)) -#define nfs4_fattr_bitmap_maxsz 3 +#define nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz 
(op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) @@ -255,7 +255,7 @@ static int nfs4_stat_to_errno(int); #define decode_fs_locations_maxsz \ (0) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) -#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN))) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) @@ -1725,7 +1725,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(args->flags); /*flags */ /* Fore Channel */ - *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(0); /* header padding size */ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ @@ -1734,7 +1734,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(0); /* header padding size */ *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ @@ -3098,7 +3098,7 @@ out_overflow: return -EIO; } -static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) { __be32 *p; @@ -3109,7 +3109,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) if (unlikely(!p)) goto out_overflow; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; - return -be32_to_cpup(p); + *res = 
-be32_to_cpup(p); } return 0; out_overflow: @@ -4070,6 +4070,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, int status; umode_t fmode = 0; uint32_t type; + int32_t err; status = decode_attr_type(xdr, bitmap, &type); if (status < 0) @@ -4095,13 +4096,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_error(xdr, bitmap); - if (status == -NFS4ERR_WRONGSEC) { - nfs_fixup_secinfo_attributes(fattr, fh); - status = 0; - } + err = 0; + status = decode_attr_error(xdr, bitmap, &err); if (status < 0) goto xdr_error; + if (err == -NFS4ERR_WRONGSEC) + nfs_fixup_secinfo_attributes(fattr, fh); status = decode_attr_filehandle(xdr, bitmap, fh); if (status < 0) @@ -4997,12 +4997,14 @@ static int decode_chan_attrs(struct xdr_stream *xdr, struct nfs4_channel_attrs *attrs) { __be32 *p; - u32 nr_attrs; + u32 nr_attrs, val; p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) goto out_overflow; - attrs->headerpadsz = be32_to_cpup(p++); + val = be32_to_cpup(p++); /* headerpadsz */ + if (val) + return -EINVAL; /* no support for header padding yet */ attrs->max_rqst_sz = be32_to_cpup(p++); attrs->max_resp_sz = be32_to_cpup(p++); attrs->max_resp_sz_cached = be32_to_cpup(p++); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 9cf208df1f2..8ff2ea3f10e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -108,7 +108,6 @@ _dev_list_add(const struct nfs_server *nfss, de = n; } - atomic_inc(&de->id_node.ref); return de; } @@ -1001,6 +1000,9 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, if (!pnfs_generic_pg_test(pgio, prev, req)) return false; + if (pgio->pg_lseg == NULL) + return true; + return pgio->pg_count + req->wb_bytes <= OBJIO_LSEG(pgio->pg_lseg)->max_io_size; } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index dc3956c0de8..1d06f8e2ade 100644 --- 
a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -291,7 +291,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) struct nfs_read_data *rdata; state->status = status; - dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); + dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); rdata = state->rpcdata; rdata->task.tk_status = status; if (status >= 0) { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7913961aff2..00985571628 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -204,7 +204,7 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } -static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) { /* * FIXME: ideally we should be able to coalesce all requests @@ -218,6 +218,7 @@ static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_p return desc->pg_count + req->wb_bytes <= desc->pg_bsize; } +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); /** * nfs_pageio_init - initialise a page io descriptor diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8c1309d852a..29c0ca7fc34 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -634,14 +634,16 @@ _pnfs_return_layout(struct inode *ino) spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) { + if (!lo) { spin_unlock(&ino->i_lock); - dprintk("%s: no layout segments to return\n", __func__); - goto out; + dprintk("%s: no layout to return\n", __func__); + return status; } stateid = nfsi->layout->plh_stateid; /* Reference matched in nfs4_layoutreturn_release */ get_layout_hdr(lo); + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + lo->plh_block_lgets++; spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -650,6 +652,9 @@ _pnfs_return_layout(struct inode *ino) lrp = 
kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; + set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); + set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + put_layout_hdr(lo); goto out; } @@ -887,7 +892,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, ret = get_lseg(lseg); break; } - if (cmp_layout(range, &lseg->pls_range) > 0) + if (lseg->pls_range.offset > range->offset) break; } @@ -1059,23 +1064,36 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, gfp_flags = GFP_NOFS; } - if (pgio->pg_count == prev->wb_bytes) { + if (pgio->pg_lseg == NULL) { + if (pgio->pg_count != prev->wb_bytes) + return true; /* This is first coelesce call for a series of nfs_pages */ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, prev->wb_context, - req_offset(req), + req_offset(prev), pgio->pg_count, access_type, gfp_flags); - return true; + if (pgio->pg_lseg == NULL) + return true; } - if (pgio->pg_lseg && - req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length)) - return false; - - return true; + /* + * Test if a nfs_page is fully contained in the pnfs_layout_range. + * Note that this test makes several assumptions: + * - that the previous nfs_page in the struct nfs_pageio_descriptor + * is known to lie within the range. + * - that the nfs_page being tested is known to be contiguous with the + * previous nfs_page. + * - Layout ranges are page aligned, so we only have to test the + * start offset of the request. + * + * Please also note that 'end_offset' is actually the offset of the + * first byte that lies outside the pnfs_layout_range. FIXME? 
+ * + */ + return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 48d0a8e4d06..96bf4e6f45b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -186,6 +186,7 @@ int pnfs_ld_read_done(struct nfs_read_data *); /* pnfs_dev.c */ struct nfs4_deviceid_node { struct hlist_node node; + struct hlist_node tmpnode; const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; struct nfs4_deviceid deviceid; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index c65e133ce9c..f0f8e1e22f6 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -174,6 +174,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, const struct nfs4_deviceid *id) { INIT_HLIST_NODE(&d->node); + INIT_HLIST_NODE(&d->tmpnode); d->ld = ld; d->nfs_client = nfs_client; d->deviceid = *id; @@ -208,6 +209,7 @@ nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); spin_unlock(&nfs4_deviceid_lock); + atomic_inc(&new->ref); return new; } @@ -238,24 +240,29 @@ static void _deviceid_purge_client(const struct nfs_client *clp, long hash) { struct nfs4_deviceid_node *d; - struct hlist_node *n, *next; + struct hlist_node *n; HLIST_HEAD(tmp); + spin_lock(&nfs4_deviceid_lock); rcu_read_lock(); hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); - hlist_add_head(&d->node, &tmp); + hlist_add_head(&d->tmpnode, &tmp); } rcu_read_unlock(); + spin_unlock(&nfs4_deviceid_lock); if (hlist_empty(&tmp)) return; synchronize_rcu(); - hlist_for_each_entry_safe(d, n, next, &tmp, node) + while (!hlist_empty(&tmp)) { + d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); + hlist_del(&d->tmpnode); if (atomic_dec_and_test(&d->ref)) d->ld->free_deviceid_node(d); + } } void @@ -263,8 +270,8 @@ 
nfs4_deviceid_purge_client(const struct nfs_client *clp) { long h; - spin_lock(&nfs4_deviceid_lock); + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + return; for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) _deviceid_purge_client(clp, h); - spin_unlock(&nfs4_deviceid_lock); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e268e3b2349..72716805968 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -864,6 +864,8 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; data->args.pgbase = req->wb_pgbase + offset; data->args.pages = data->pagevec; data->args.count = count; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 18b3e8975fe..fbb2a5ef581 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,7 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS + select CRYPTO help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1f5eae40f34..2b1449dd2f4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> +#include <linux/sunrpc/gss_krb5_enctypes.h> #include "idmap.h" #include "nfsd.h" @@ -189,18 +190,10 @@ static struct file_operations export_features_operations = { .release = single_release, }; -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) static int supported_enctypes_show(struct seq_file *m, void *v) { - struct gss_api_mech *k5mech; - - k5mech = gss_mech_get_by_name("krb5"); - if (k5mech == NULL) - goto out; - if (k5mech->gm_upcall_enctypes != NULL) - seq_printf(m, k5mech->gm_upcall_enctypes); - gss_mech_put(k5mech); -out: + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); return 0; } @@ -215,7 +208,7 @@ static struct file_operations supported_enctypes_ops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1427,9 +1420,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, 
S_IWUSR|S_IRUSR}, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d5718273bb3..fd0acca5370 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -696,7 +696,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} /* * Open an existing file or directory. @@ -744,12 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!inode->i_fop) goto out; - /* - * Check to see if there are any leases on this file. - * This may block while leases are broken. - */ - if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) - host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); + host_err = nfsd_open_break_lease(inode, access); if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; @@ -1660,8 +1663,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (!dold->d_inode) goto out_drop_write; host_err = nfsd_break_lease(dold->d_inode); - if (host_err) + if (host_err) { + err = nfserrno(host_err); goto out_drop_write; + } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 7eafe468a29..b2e3ff34762 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1346,6 +1346,11 @@ static void nilfs_btree_shrink(struct nilfs_bmap *btree, path[level].bp_bh = NULL; } +static void nilfs_btree_nop(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ +} static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, @@ -1356,20 +1361,19 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct buffer_head *bh; struct nilfs_btree_node *node, *parent, 
*sib; __u64 sibptr; - int pindex, level, ncmin, ncmax, ncblk, ret; + int pindex, dindex, level, ncmin, ncmax, ncblk, ret; ret = 0; stats->bs_nblocks = 0; ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); ncblk = nilfs_btree_nchildren_per_block(btree); - for (level = NILFS_BTREE_LEVEL_NODE_MIN; + for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(node, path[level].bp_index, - ncblk); + nilfs_btree_node_get_ptr(node, dindex, ncblk); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) @@ -1383,6 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; + dindex = pindex; if (pindex > 0) { /* left sibling */ @@ -1421,6 +1426,14 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_concat_right; stats->bs_nblocks++; + /* + * When merging right sibling node + * into the current node, pointer to + * the right sibling node must be + * terminated instead. The adjustment + * below is required for that. 
+ */ + dindex = pindex + 1; /* continue; */ } } else { @@ -1431,29 +1444,31 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; + level++; + path[level].bp_op = nilfs_btree_nop; + goto shrink_root_child; } else { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; + goto out; } - - goto out; - } } + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + +shrink_root_child: node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(node, path[level].bp_index, + nilfs_btree_node_get_ptr(node, dindex, NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; - /* child of the root node is deleted */ - path[level].bp_op = nilfs_btree_do_delete; - stats->bs_nblocks++; - /* success */ out: *levelp = level; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b954878ad6c..b9b45fc2903 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -801,12 +801,7 @@ out_err: int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - - root = NILFS_I(inode)->i_root; + struct nilfs_root *root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 141646e88fb..bb24ab6c282 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2573,7 +2573,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; if (nilfs->ns_interval) - sci->sc_interval = nilfs->ns_interval; + sci->sc_interval = HZ * nilfs->ns_interval; if (nilfs->ns_watermark) sci->sc_watermark = nilfs->ns_watermark; 
return sci; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cdbaf5e9730..56f61027236 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1072,7 +1072,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d738a7e493d..2c6d95257a4 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -4,7 +4,6 @@ * Released under GPL v2. */ -#include <linux/version.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/buffer_head.h> diff --git a/fs/proc/base.c b/fs/proc/base.c index 14def991d9d..fc5bc276769 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2169,11 +2169,7 @@ static const struct file_operations proc_fd_operations = { */ static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { - int rv; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - rv = generic_permission(inode, mask, flags, NULL); + int rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) @@ -2712,6 +2708,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) struct task_io_accounting acct = task->ioac; unsigned long flags; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return -EACCES; + if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2843,7 +2842,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tgid_io_accounting), + INF("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), @@ -3185,7 +3184,7 
@@ static const struct pid_entry tid_base_stuff[] = { REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tid_io_accounting), + INF("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 781dec5bd68..be177f702ac 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -38,18 +38,21 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); + void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + ei = PROC_I(inode); inode->i_mode = S_IFREG|S_IRUSR; inode->i_fop = &ns_file_operations; ei->ns_ops = ns_ops; - ei->ns = ns_ops->get(task); - if (!ei->ns) - goto out_iput; + ei->ns = ns; dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f50133c11c2..d167de365a8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -304,9 +304,6 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) struct ctl_table *table; int error; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; diff --git a/fs/proc/root.c b/fs/proc/root.c index a9000e9cfee..d6c3b416529 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -28,11 +28,12 @@ static int proc_test_super(struct super_block *sb, void *data) static int proc_set_super(struct super_block *sb, void *data) { - struct pid_namespace *ns; - - ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - return set_anon_super(sb, NULL); + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace 
*)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; } static struct dentry *proc_mount(struct file_system_type *fs_type, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e8a62f41b45..d7808969096 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -954,8 +954,6 @@ static int xattr_mount_check(struct super_block *s) int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index f0511e81696..eed99428f10 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -27,14 +27,18 @@ static unsigned long romfs_get_unmapped_area(struct file *file, { struct inode *inode = file->f_mapping->host; struct mtd_info *mtd = inode->i_sb->s_mtd; - unsigned long isize, offset; + unsigned long isize, offset, maxpages, lpages; if (!mtd) goto cant_map_directly; + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; isize = i_size_read(inode); offset = pgoff << PAGE_SHIFT; - if (offset > isize || len > isize || offset > isize - len) + + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) return (unsigned long) -EINVAL; /* we need to call down to the MTD layer to do the actual mapping */ diff --git a/fs/super.c b/fs/super.c index c75593953c5..ab3d672db0d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -822,7 +822,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 266895783b4..e34f0d99ea4 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -95,6 +95,14 @@ 
static int sysfs_set_super(struct super_block *sb, void *data) return error; } +static void free_sysfs_super_info(struct sysfs_super_info *info) +{ + int type; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + kobj_ns_drop(type, info->ns[type]); + kfree(info); +} + static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -108,11 +116,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, return ERR_PTR(-ENOMEM); for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_current(type); + info->ns[type] = kobj_ns_grab_current(type); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + free_sysfs_super_info(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { @@ -131,12 +139,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, static void sysfs_kill_sb(struct super_block *sb) { struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing sysfs_super_info. */ kill_anon_super(sb); - kfree(info); + free_sysfs_super_info(info); } static struct file_system_type sysfs_fs_type = { @@ -145,28 +152,6 @@ static struct file_system_type sysfs_fs_type = { .kill_sb = sysfs_kill_sb, }; -void sysfs_exit_ns(enum kobj_ns_type type, const void *ns) -{ - struct super_block *sb; - - mutex_lock(&sysfs_mutex); - spin_lock(&sb_lock); - list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) { - struct sysfs_super_info *info = sysfs_info(sb); - /* - * If we see a superblock on the fs_supers/s_instances - * list the unmount has not completed and sb->s_fs_info - * points to a valid struct sysfs_super_info. 
- */ - /* Ignore superblocks with the wrong ns */ - if (info->ns[type] != ns) - continue; - info->ns[type] = NULL; - } - spin_unlock(&sb_lock); - mutex_unlock(&sysfs_mutex); -} - int __init sysfs_init(void) { int err = -ENOMEM; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3d28af31d86..2ed2404f311 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -136,7 +136,7 @@ struct sysfs_addrm_cxt { * instance). */ struct sysfs_super_info { - const void *ns[KOBJ_NS_TYPES]; + void *ns[KOBJ_NS_TYPES]; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; diff --git a/fs/timerfd.c b/fs/timerfd.c index f67acbdda5e..dffeb3795af 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -61,7 +61,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) /* * Called when the clock was set to cancel the timers in the cancel - * list. + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { @@ -76,6 +78,7 @@ void timerfd_clock_was_set(void) spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs.tv64 != moffs.tv64) { ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; wake_up_locked(&ctx->wqh); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 87cd0ead863..fb3b5c813a3 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -78,7 +78,7 @@ static int nothing_to_commit(struct ubifs_info *c) * If the root TNC node is dirty, we definitely have something to * commit. 
*/ - if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + if (c->zroot.znode && ubifs_zn_dirty(c->zroot.znode)) return 0; /* @@ -418,7 +418,7 @@ int ubifs_run_commit(struct ubifs_info *c) spin_lock(&c->cs_lock); if (c->cmt_state == COMMIT_BROKEN) { - err = -EINVAL; + err = -EROFS; goto out; } @@ -444,7 +444,7 @@ int ubifs_run_commit(struct ubifs_info *c) * re-check it. */ if (c->cmt_state == COMMIT_BROKEN) { - err = -EINVAL; + err = -EROFS; goto out_cmt_unlock; } @@ -576,7 +576,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) struct idx_node *i; size_t sz; - if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) + if (!dbg_is_chk_index(c)) return 0; INIT_LIST_HEAD(&list); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0bb2bcef0de..eef109a1a92 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -27,13 +27,12 @@ * various local functions of those subsystems. */ -#define UBIFS_DBG_PRESERVE_UBI - -#include "ubifs.h" #include <linux/module.h> -#include <linux/moduleparam.h> #include <linux/debugfs.h> #include <linux/math64.h> +#include <linux/uaccess.h> +#include <linux/random.h> +#include "ubifs.h" #ifdef CONFIG_UBIFS_FS_DEBUG @@ -42,15 +41,6 @@ DEFINE_SPINLOCK(dbg_lock); static char dbg_key_buf0[128]; static char dbg_key_buf1[128]; -unsigned int ubifs_chk_flags; -unsigned int ubifs_tst_flags; - -module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); -module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); - -MODULE_PARM_DESC(debug_chks, "Debug check flags"); -MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); - static const char *get_key_fmt(int fmt) { switch (fmt) { @@ -91,6 +81,28 @@ static const char *get_key_type(int type) } } +static const char *get_dent_type(int type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return "file"; + case UBIFS_ITYPE_DIR: + return "dir"; + case UBIFS_ITYPE_LNK: + return "symlink"; + case UBIFS_ITYPE_BLK: + return "blkdev"; + case 
UBIFS_ITYPE_CHR: + return "char dev"; + case UBIFS_ITYPE_FIFO: + return "fifo"; + case UBIFS_ITYPE_SOCK: + return "socket"; + default: + return "unknown/invalid type"; + } +} + static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, char *buffer) { @@ -234,9 +246,13 @@ static void dump_ch(const struct ubifs_ch *ch) printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); } -void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); + struct qstr nm = { .name = NULL }; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + int count = 2; printk(KERN_DEBUG "Dump in-memory inode:"); printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); @@ -270,6 +286,32 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); + + if (!S_ISDIR(inode->i_mode)) + return; + + printk(KERN_DEBUG "List of directory entries:\n"); + ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); + + lowest_dent_key(c, &key, inode->i_ino); + while (1) { + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + if (PTR_ERR(dent) != -ENOENT) + printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent)); + break; + } + + printk(KERN_DEBUG "\t%d: %s (%s)\n", + count++, dent->name, get_dent_type(dent->type)); + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + kfree(pdent); + pdent = dent; + key_read(c, &dent->key, &key); + } + kfree(pdent); } void dbg_dump_node(const struct ubifs_info *c, const void *node) @@ -278,7 +320,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) union ubifs_key key; const struct ubifs_ch *ch = node; - if (dbg_failure_mode) + if (dbg_is_tst_rcvry(c)) return; /* If the magic is incorrect, just 
hexdump the first bytes */ @@ -834,7 +876,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) struct ubifs_scan_node *snod; void *buf; - if (dbg_failure_mode) + if (dbg_is_tst_rcvry(c)) return; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", @@ -1080,6 +1122,7 @@ out: /** * dbg_check_synced_i_size - check synchronized inode size. + * @c: UBIFS file-system description object * @inode: inode to check * * If inode is clean, synchronized inode size has to be equivalent to current @@ -1087,12 +1130,12 @@ out: * has to be locked). Returns %0 if synchronized inode size if correct, and * %-EINVAL if not. */ -int dbg_check_synced_i_size(struct inode *inode) +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) { int err = 0; struct ubifs_inode *ui = ubifs_inode(inode); - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (!S_ISREG(inode->i_mode)) return 0; @@ -1125,7 +1168,7 @@ int dbg_check_synced_i_size(struct inode *inode) * Note, it is good idea to make sure the @dir->i_mutex is locked before * calling this function. 
*/ -int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) { unsigned int nlink = 2; union ubifs_key key; @@ -1133,7 +1176,7 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) struct qstr nm = { .name = NULL }; loff_t size = UBIFS_INO_NODE_SZ; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (!S_ISDIR(dir->i_mode)) @@ -1167,12 +1210,14 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) "but calculated size is %llu", dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); + dbg_dump_inode(c, dir); dump_stack(); return -EINVAL; } if (dir->i_nlink != nlink) { ubifs_err("directory inode %lu has nlink %u, but calculated " "nlink is %u", dir->i_ino, dir->i_nlink, nlink); + dbg_dump_inode(c, dir); dump_stack(); return -EINVAL; } @@ -1489,7 +1534,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) long clean_cnt = 0, dirty_cnt = 0; int err, last; - if (!(ubifs_chk_flags & UBIFS_CHK_TNC)) + if (!dbg_is_chk_index(c)) return 0; ubifs_assert(mutex_is_locked(&c->tnc_mutex)); @@ -1736,7 +1781,7 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) int err; long long calc = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) + if (!dbg_is_chk_index(c)) return 0; err = dbg_walk_index(c, NULL, add_size, &calc); @@ -2312,7 +2357,7 @@ int dbg_check_filesystem(struct ubifs_info *c) int err; struct fsck_data fsckd; - if (!(ubifs_chk_flags & UBIFS_CHK_FS)) + if (!dbg_is_chk_fs(c)) return 0; fsckd.inodes = RB_ROOT; @@ -2347,7 +2392,7 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) struct list_head *cur; struct ubifs_scan_node *sa, *sb; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; for (cur = head->next; cur->next != head; cur = cur->next) { @@ -2414,7 +2459,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head 
*head) struct list_head *cur; struct ubifs_scan_node *sa, *sb; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; for (cur = head->next; cur->next != head; cur = cur->next) { @@ -2491,214 +2536,141 @@ error_dump: return 0; } -int dbg_force_in_the_gaps(void) +static inline int chance(unsigned int n, unsigned int out_of) { - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) - return 0; + return !!((random32() % out_of) + 1 <= n); - return !(random32() & 7); } -/* Failure mode for recovery testing */ - -#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d)) - -struct failure_mode_info { - struct list_head list; - struct ubifs_info *c; -}; - -static LIST_HEAD(fmi_list); -static DEFINE_SPINLOCK(fmi_lock); - -static unsigned int next; - -static int simple_rand(void) -{ - if (next == 0) - next = current->pid; - next = next * 1103515245 + 12345; - return (next >> 16) & 32767; -} - -static void failure_mode_init(struct ubifs_info *c) -{ - struct failure_mode_info *fmi; - - fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); - if (!fmi) { - ubifs_err("Failed to register failure mode - no memory"); - return; - } - fmi->c = c; - spin_lock(&fmi_lock); - list_add_tail(&fmi->list, &fmi_list); - spin_unlock(&fmi_lock); -} - -static void failure_mode_exit(struct ubifs_info *c) +static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) { - struct failure_mode_info *fmi, *tmp; - - spin_lock(&fmi_lock); - list_for_each_entry_safe(fmi, tmp, &fmi_list, list) - if (fmi->c == c) { - list_del(&fmi->list); - kfree(fmi); - } - spin_unlock(&fmi_lock); -} - -static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc) -{ - struct failure_mode_info *fmi; - - spin_lock(&fmi_lock); - list_for_each_entry(fmi, &fmi_list, list) - if (fmi->c->ubi == desc) { - struct ubifs_info *c = fmi->c; - - spin_unlock(&fmi_lock); - return c; - } - spin_unlock(&fmi_lock); - return NULL; -} - -static int in_failure_mode(struct ubi_volume_desc *desc) -{ - struct 
ubifs_info *c = dbg_find_info(desc); - - if (c && dbg_failure_mode) - return c->dbg->failure_mode; - return 0; -} + struct ubifs_debug_info *d = c->dbg; -static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) -{ - struct ubifs_info *c = dbg_find_info(desc); - struct ubifs_debug_info *d; + ubifs_assert(dbg_is_tst_rcvry(c)); - if (!c || !dbg_failure_mode) - return 0; - d = c->dbg; - if (d->failure_mode) - return 1; - if (!d->fail_cnt) { - /* First call - decide delay to failure */ + if (!d->pc_cnt) { + /* First call - decide delay to the power cut */ if (chance(1, 2)) { - unsigned int delay = 1 << (simple_rand() >> 11); + unsigned long delay; if (chance(1, 2)) { - d->fail_delay = 1; - d->fail_timeout = jiffies + - msecs_to_jiffies(delay); - dbg_rcvry("failing after %ums", delay); + d->pc_delay = 1; + /* Fail withing 1 minute */ + delay = random32() % 60000; + d->pc_timeout = jiffies; + d->pc_timeout += msecs_to_jiffies(delay); + ubifs_warn("failing after %lums", delay); } else { - d->fail_delay = 2; - d->fail_cnt_max = delay; - dbg_rcvry("failing after %u calls", delay); + d->pc_delay = 2; + delay = random32() % 10000; + /* Fail within 10000 operations */ + d->pc_cnt_max = delay; + ubifs_warn("failing after %lu calls", delay); } } - d->fail_cnt += 1; + + d->pc_cnt += 1; } + /* Determine if failure delay has expired */ - if (d->fail_delay == 1) { - if (time_before(jiffies, d->fail_timeout)) + if (d->pc_delay == 1 && time_before(jiffies, d->pc_timeout)) return 0; - } else if (d->fail_delay == 2) - if (d->fail_cnt++ < d->fail_cnt_max) + if (d->pc_delay == 2 && d->pc_cnt++ < d->pc_cnt_max) return 0; + if (lnum == UBIFS_SB_LNUM) { - if (write) { - if (chance(1, 2)) - return 0; - } else if (chance(19, 20)) + if (write && chance(1, 2)) + return 0; + if (chance(19, 20)) return 0; - dbg_rcvry("failing in super block LEB %d", lnum); + ubifs_warn("failing in super block LEB %d", lnum); } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { if 
(chance(19, 20)) return 0; - dbg_rcvry("failing in master LEB %d", lnum); + ubifs_warn("failing in master LEB %d", lnum); } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { - if (write) { - if (chance(99, 100)) - return 0; - } else if (chance(399, 400)) + if (write && chance(99, 100)) return 0; - dbg_rcvry("failing in log LEB %d", lnum); + if (chance(399, 400)) + return 0; + ubifs_warn("failing in log LEB %d", lnum); } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { - if (write) { - if (chance(7, 8)) - return 0; - } else if (chance(19, 20)) + if (write && chance(7, 8)) return 0; - dbg_rcvry("failing in LPT LEB %d", lnum); + if (chance(19, 20)) + return 0; + ubifs_warn("failing in LPT LEB %d", lnum); } else if (lnum >= c->orph_first && lnum <= c->orph_last) { - if (write) { - if (chance(1, 2)) - return 0; - } else if (chance(9, 10)) + if (write && chance(1, 2)) + return 0; + if (chance(9, 10)) return 0; - dbg_rcvry("failing in orphan LEB %d", lnum); + ubifs_warn("failing in orphan LEB %d", lnum); } else if (lnum == c->ihead_lnum) { if (chance(99, 100)) return 0; - dbg_rcvry("failing in index head LEB %d", lnum); + ubifs_warn("failing in index head LEB %d", lnum); } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { if (chance(9, 10)) return 0; - dbg_rcvry("failing in GC head LEB %d", lnum); + ubifs_warn("failing in GC head LEB %d", lnum); } else if (write && !RB_EMPTY_ROOT(&c->buds) && !ubifs_search_bud(c, lnum)) { if (chance(19, 20)) return 0; - dbg_rcvry("failing in non-bud LEB %d", lnum); + ubifs_warn("failing in non-bud LEB %d", lnum); } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || c->cmt_state == COMMIT_RUNNING_REQUIRED) { if (chance(999, 1000)) return 0; - dbg_rcvry("failing in bud LEB %d commit running", lnum); + ubifs_warn("failing in bud LEB %d commit running", lnum); } else { if (chance(9999, 10000)) return 0; - dbg_rcvry("failing in bud LEB %d commit not running", lnum); + ubifs_warn("failing in bud LEB %d commit not 
running", lnum); } - ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); - d->failure_mode = 1; + + d->pc_happened = 1; + ubifs_warn("========== Power cut emulated =========="); dump_stack(); return 1; } -static void cut_data(const void *buf, int len) +static void cut_data(const void *buf, unsigned int len) { - int flen, i; + unsigned int from, to, i, ffs = chance(1, 2); unsigned char *p = (void *)buf; - flen = (len * (long long)simple_rand()) >> 15; - for (i = flen; i < len; i++) - p[i] = 0xff; -} + from = random32() % (len + 1); + if (chance(1, 2)) + to = random32() % (len - from + 1); + else + to = len; -int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, - int len, int check) -{ - if (in_failure_mode(desc)) - return -EROFS; - return ubi_leb_read(desc, lnum, buf, offset, len, check); + if (from < to) + ubifs_warn("filled bytes %u-%u with %s", from, to - 1, + ffs ? "0xFFs" : "random data"); + + if (ffs) + for (i = from; i < to; i++) + p[i] = 0xFF; + else + for (i = from; i < to; i++) + p[i] = random32() % 0x100; } -int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, - int offset, int len, int dtype) +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, + int offs, int len, int dtype) { int err, failing; - if (in_failure_mode(desc)) + if (c->dbg->pc_happened) return -EROFS; - failing = do_fail(desc, lnum, 1); + + failing = power_cut_emulated(c, lnum, 1); if (failing) cut_data(buf, len); - err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); if (err) return err; if (failing) @@ -2706,162 +2678,207 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, return 0; } -int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, int dtype) { int err; - if (do_fail(desc, lnum, 1)) + if (c->dbg->pc_happened) return 
-EROFS; - err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (power_cut_emulated(c, lnum, 1)) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); if (err) return err; - if (do_fail(desc, lnum, 1)) + if (power_cut_emulated(c, lnum, 1)) return -EROFS; return 0; } -int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +int dbg_leb_unmap(struct ubifs_info *c, int lnum) { int err; - if (do_fail(desc, lnum, 0)) + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 0)) return -EROFS; - err = ubi_leb_erase(desc, lnum); + err = ubi_leb_unmap(c->ubi, lnum); if (err) return err; - if (do_fail(desc, lnum, 0)) + if (power_cut_emulated(c, lnum, 0)) return -EROFS; return 0; } -int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) +int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype) { int err; - if (do_fail(desc, lnum, 0)) + if (c->dbg->pc_happened) return -EROFS; - err = ubi_leb_unmap(desc, lnum); + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + err = ubi_leb_map(c->ubi, lnum, dtype); if (err) return err; - if (do_fail(desc, lnum, 0)) + if (power_cut_emulated(c, lnum, 0)) return -EROFS; return 0; } -int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) -{ - if (in_failure_mode(desc)) - return -EROFS; - return ubi_is_mapped(desc, lnum); -} +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *dfs_rootdir; -int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +static int dfs_file_open(struct inode *inode, struct file *file) { - int err; - - if (do_fail(desc, lnum, 0)) - return -EROFS; - err = ubi_leb_map(desc, lnum, dtype); - if (err) - return err; - if (do_fail(desc, lnum, 0)) - return -EROFS; - return 0; + file->private_data = inode->i_private; + return nonseekable_open(inode, file); } /** - * ubifs_debugging_init - initialize UBIFS debugging. 
- * @c: UBIFS file-system description object + * provide_user_output - provide output to the user reading a debugfs file. + * @val: boolean value for the answer + * @u: the buffer to store the answer at + * @count: size of the buffer + * @ppos: position in the @u output buffer * - * This function initializes debugging-related data for the file system. - * Returns zero in case of success and a negative error code in case of + * This is a simple helper function which stores @val boolean value in the user + * buffer when the user reads one of UBIFS debugfs files. Returns amount of + * bytes written to @u in case of success and a negative error code in case of * failure. */ -int ubifs_debugging_init(struct ubifs_info *c) +static int provide_user_output(int val, char __user *u, size_t count, + loff_t *ppos) { - c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); - if (!c->dbg) - return -ENOMEM; + char buf[3]; - failure_mode_init(c); - return 0; + if (val) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + + return simple_read_from_buffer(u, count, ppos, buf, 2); } -/** - * ubifs_debugging_exit - free debugging data. - * @c: UBIFS file-system description object - */ -void ubifs_debugging_exit(struct ubifs_info *c) +static ssize_t dfs_file_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) { - failure_mode_exit(c); - kfree(c->dbg); -} + struct dentry *dent = file->f_path.dentry; + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + int val; + + if (dent == d->dfs_chk_gen) + val = d->chk_gen; + else if (dent == d->dfs_chk_index) + val = d->chk_index; + else if (dent == d->dfs_chk_orph) + val = d->chk_orph; + else if (dent == d->dfs_chk_lprops) + val = d->chk_lprops; + else if (dent == d->dfs_chk_fs) + val = d->chk_fs; + else if (dent == d->dfs_tst_rcvry) + val = d->tst_rcvry; + else + return -EINVAL; -/* - * Root directory for UBIFS stuff in debugfs. 
Contains sub-directories which - * contain the stuff specific to particular file-system mounts. - */ -static struct dentry *dfs_rootdir; + return provide_user_output(val, u, count, ppos); +} /** - * dbg_debugfs_init - initialize debugfs file-system. + * interpret_user_input - interpret user debugfs file input. + * @u: user-provided buffer with the input + * @count: buffer size * - * UBIFS uses debugfs file-system to expose various debugging knobs to - * user-space. This function creates "ubifs" directory in the debugfs - * file-system. Returns zero in case of success and a negative error code in - * case of failure. + * This is a helper function which interpret user input to a boolean UBIFS + * debugfs file. Returns %0 or %1 in case of success and a negative error code + * in case of failure. */ -int dbg_debugfs_init(void) +static int interpret_user_input(const char __user *u, size_t count) { - dfs_rootdir = debugfs_create_dir("ubifs", NULL); - if (IS_ERR(dfs_rootdir)) { - int err = PTR_ERR(dfs_rootdir); - ubifs_err("cannot create \"ubifs\" debugfs directory, " - "error %d\n", err); - return err; - } + size_t buf_size; + char buf[8]; - return 0; -} + buf_size = min_t(size_t, count, (sizeof(buf) - 1)); + if (copy_from_user(buf, u, buf_size)) + return -EFAULT; -/** - * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. 
- */ -void dbg_debugfs_exit(void) -{ - debugfs_remove(dfs_rootdir); -} + if (buf[0] == '1') + return 1; + else if (buf[0] == '0') + return 0; -static int open_debugfs_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return nonseekable_open(inode, file); + return -EINVAL; } -static ssize_t write_debugfs_file(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t dfs_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) { struct ubifs_info *c = file->private_data; struct ubifs_debug_info *d = c->dbg; + struct dentry *dent = file->f_path.dentry; + int val; - if (file->f_path.dentry == d->dfs_dump_lprops) + /* + * TODO: this is racy - the file-system might have already been + * unmounted and we'd oops in this case. The plan is to fix it with + * help of 'iterate_supers_type()' which we should have in v3.0: when + * a debugfs opened, we rember FS's UUID in file->private_data. Then + * whenever we access the FS via a debugfs file, we iterate all UBIFS + * superblocks and fine the one with the same UUID, and take the + * locking right. + * + * The other way to go suggested by Al Viro is to create a separate + * 'ubifs-debug' file-system instead. 
+ */ + if (file->f_path.dentry == d->dfs_dump_lprops) { dbg_dump_lprops(c); - else if (file->f_path.dentry == d->dfs_dump_budg) + return count; + } + if (file->f_path.dentry == d->dfs_dump_budg) { dbg_dump_budg(c, &c->bi); - else if (file->f_path.dentry == d->dfs_dump_tnc) { + return count; + } + if (file->f_path.dentry == d->dfs_dump_tnc) { mutex_lock(&c->tnc_mutex); dbg_dump_tnc(c); mutex_unlock(&c->tnc_mutex); - } else + return count; + } + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == d->dfs_chk_gen) + d->chk_gen = val; + else if (dent == d->dfs_chk_index) + d->chk_index = val; + else if (dent == d->dfs_chk_orph) + d->chk_orph = val; + else if (dent == d->dfs_chk_lprops) + d->chk_lprops = val; + else if (dent == d->dfs_chk_fs) + d->chk_fs = val; + else if (dent == d->dfs_tst_rcvry) + d->tst_rcvry = val; + else return -EINVAL; return count; } static const struct file_operations dfs_fops = { - .open = open_debugfs_file, - .write = write_debugfs_file, + .open = dfs_file_open, + .read = dfs_file_read, + .write = dfs_file_write, .owner = THIS_MODULE, .llseek = no_llseek, }; @@ -2880,12 +2897,20 @@ static const struct file_operations dfs_fops = { */ int dbg_debugfs_init_fs(struct ubifs_info *c) { - int err; + int err, n; const char *fname; struct dentry *dent; struct ubifs_debug_info *d = c->dbg; - sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + c->vi.ubi_num, c->vi.vol_id); + if (n == UBIFS_DFS_DIR_LEN) { + /* The array size is too small */ + fname = UBIFS_DFS_DIR_NAME; + dent = ERR_PTR(-EINVAL); + goto out; + } + fname = d->dfs_dir_name; dent = debugfs_create_dir(fname, dfs_rootdir); if (IS_ERR_OR_NULL(dent)) @@ -2910,13 +2935,55 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) goto out_remove; d->dfs_dump_tnc = dent; + fname = "chk_general"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if 
(IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_gen = dent; + + fname = "chk_index"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_index = dent; + + fname = "chk_orphans"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_orph = dent; + + fname = "chk_lprops"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_lprops = dent; + + fname = "chk_fs"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_fs = dent; + + fname = "tst_recovery"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_tst_rcvry = dent; + return 0; out_remove: debugfs_remove_recursive(d->dfs_dir); out: err = dent ? 
PTR_ERR(dent) : -ENODEV; - ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", + ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n", fname, err); return err; } @@ -2930,4 +2997,179 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c) debugfs_remove_recursive(c->dbg->dfs_dir); } +struct ubifs_global_debug_info ubifs_dbg; + +static struct dentry *dfs_chk_gen; +static struct dentry *dfs_chk_index; +static struct dentry *dfs_chk_orph; +static struct dentry *dfs_chk_lprops; +static struct dentry *dfs_chk_fs; +static struct dentry *dfs_tst_rcvry; + +static ssize_t dfs_global_file_read(struct file *file, char __user *u, + size_t count, loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + int val; + + if (dent == dfs_chk_gen) + val = ubifs_dbg.chk_gen; + else if (dent == dfs_chk_index) + val = ubifs_dbg.chk_index; + else if (dent == dfs_chk_orph) + val = ubifs_dbg.chk_orph; + else if (dent == dfs_chk_lprops) + val = ubifs_dbg.chk_lprops; + else if (dent == dfs_chk_fs) + val = ubifs_dbg.chk_fs; + else if (dent == dfs_tst_rcvry) + val = ubifs_dbg.tst_rcvry; + else + return -EINVAL; + + return provide_user_output(val, u, count, ppos); +} + +static ssize_t dfs_global_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + int val; + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == dfs_chk_gen) + ubifs_dbg.chk_gen = val; + else if (dent == dfs_chk_index) + ubifs_dbg.chk_index = val; + else if (dent == dfs_chk_orph) + ubifs_dbg.chk_orph = val; + else if (dent == dfs_chk_lprops) + ubifs_dbg.chk_lprops = val; + else if (dent == dfs_chk_fs) + ubifs_dbg.chk_fs = val; + else if (dent == dfs_tst_rcvry) + ubifs_dbg.tst_rcvry = val; + else + return -EINVAL; + + return count; +} + +static const struct file_operations dfs_global_fops = { + .read = dfs_global_file_read, + .write = dfs_global_file_write, + .owner = THIS_MODULE, + .llseek = 
no_llseek, +}; + +/** + * dbg_debugfs_init - initialize debugfs file-system. + * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. + */ +int dbg_debugfs_init(void) +{ + int err; + const char *fname; + struct dentry *dent; + + fname = "ubifs"; + dent = debugfs_create_dir(fname, NULL); + if (IS_ERR_OR_NULL(dent)) + goto out; + dfs_rootdir = dent; + + fname = "chk_general"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_gen = dent; + + fname = "chk_index"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_index = dent; + + fname = "chk_orphans"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_orph = dent; + + fname = "chk_lprops"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_lprops = dent; + + fname = "chk_fs"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_fs = dent; + + fname = "tst_recovery"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_tst_rcvry = dent; + + return 0; + +out_remove: + debugfs_remove_recursive(dfs_rootdir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n", + fname, err); + return err; +} + +/** + * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. 
+ */ +void dbg_debugfs_exit(void) +{ + debugfs_remove_recursive(dfs_rootdir); +} + +/** + * ubifs_debugging_init - initialize UBIFS debugging. + * @c: UBIFS file-system description object + * + * This function initializes debugging-related data for the file system. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_debugging_init(struct ubifs_info *c) +{ + c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); + if (!c->dbg) + return -ENOMEM; + + return 0; +} + +/** + * ubifs_debugging_exit - free debugging data. + * @c: UBIFS file-system description object + */ +void ubifs_debugging_exit(struct ubifs_info *c) +{ + kfree(c->dbg); +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index a811ac4a26b..45174b53437 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -31,18 +31,25 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, #ifdef CONFIG_UBIFS_FS_DEBUG -#include <linux/random.h> +/* + * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" + * + 1 for "_" plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte). + */ +#define UBIFS_DFS_DIR_NAME "ubi%d_%d" +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) /** * ubifs_debug_info - per-FS debugging information. 
* @old_zroot: old index root - used by 'dbg_check_old_index()' * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' - * @failure_mode: failure mode for recovery testing - * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls - * @fail_timeout: time in jiffies when delay of failure mode expires - * @fail_cnt: current number of calls to failure mode I/O functions - * @fail_cnt_max: number of calls by which to delay failure mode + * + * @pc_happened: non-zero if an emulated power cut happened + * @pc_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @pc_timeout: time in jiffies when delay of failure mode expires + * @pc_cnt: current number of calls to failure mode I/O functions + * @pc_cnt_max: number of calls by which to delay failure mode + * * @chk_lpt_sz: used by LPT tree size checker * @chk_lpt_sz2: used by LPT tree size checker * @chk_lpt_wastage: used by LPT tree size checker @@ -56,21 +63,36 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, * @saved_free: saved amount of free space * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index extra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode is enabled + * * @dfs_dir_name: name of debugfs directory containing this file-system's files * @dfs_dir: direntry object of the file-system debugfs directory * @dfs_dump_lprops: "dump lprops" debugfs knob * @dfs_dump_budg: "dump budgeting information" debugfs knob * @dfs_dump_tnc: "dump TNC" debugfs knob + * @dfs_chk_gen: debugfs knob to enable UBIFS general extra checks + * @dfs_chk_index: debugfs knob to enable UBIFS index extra checks + * @dfs_chk_orph: debugfs knob to enable UBIFS 
orphans extra checks + * @dfs_chk_lprops: debugfs knob to enable UBIFS LEB properties extra checks + * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks + * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing */ struct ubifs_debug_info { struct ubifs_zbranch old_zroot; int old_zroot_level; unsigned long long old_zroot_sqnum; - int failure_mode; - int fail_delay; - unsigned long fail_timeout; - unsigned int fail_cnt; - unsigned int fail_cnt_max; + + int pc_happened; + int pc_delay; + unsigned long pc_timeout; + unsigned int pc_cnt; + unsigned int pc_cnt_max; + long long chk_lpt_sz; long long chk_lpt_sz2; long long chk_lpt_wastage; @@ -84,11 +106,43 @@ struct ubifs_debug_info { long long saved_free; int saved_idx_gc_cnt; - char dfs_dir_name[100]; + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; + + char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1]; struct dentry *dfs_dir; struct dentry *dfs_dump_lprops; struct dentry *dfs_dump_budg; struct dentry *dfs_dump_tnc; + struct dentry *dfs_chk_gen; + struct dentry *dfs_chk_index; + struct dentry *dfs_chk_orph; + struct dentry *dfs_chk_lprops; + struct dentry *dfs_chk_fs; + struct dentry *dfs_tst_rcvry; +}; + +/** + * ubifs_global_debug_info - global (not per-FS) UBIFS debugging information. 
+ * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index extra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode is enabled + */ +struct ubifs_global_debug_info { + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; }; #define ubifs_assert(expr) do { \ @@ -127,6 +181,8 @@ const char *dbg_key_str1(const struct ubifs_info *c, #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) +extern spinlock_t dbg_lock; + #define ubifs_dbg_msg(type, fmt, ...) do { \ spin_lock(&dbg_lock); \ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ @@ -162,41 +218,36 @@ const char *dbg_key_str1(const struct ubifs_info *c, /* Additional recovery messages */ #define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) -/* - * Debugging check flags. - * - * UBIFS_CHK_GEN: general checks - * UBIFS_CHK_TNC: check TNC - * UBIFS_CHK_IDX_SZ: check index size - * UBIFS_CHK_ORPH: check orphans - * UBIFS_CHK_OLD_IDX: check the old index - * UBIFS_CHK_LPROPS: check lprops - * UBIFS_CHK_FS: check the file-system - */ -enum { - UBIFS_CHK_GEN = 0x1, - UBIFS_CHK_TNC = 0x2, - UBIFS_CHK_IDX_SZ = 0x4, - UBIFS_CHK_ORPH = 0x8, - UBIFS_CHK_OLD_IDX = 0x10, - UBIFS_CHK_LPROPS = 0x20, - UBIFS_CHK_FS = 0x40, -}; - -/* - * Special testing flags. 
- * - * UBIFS_TST_RCVRY: failure mode for recovery testing - */ -enum { - UBIFS_TST_RCVRY = 0x4, -}; - -extern spinlock_t dbg_lock; +extern struct ubifs_global_debug_info ubifs_dbg; -extern unsigned int ubifs_msg_flags; -extern unsigned int ubifs_chk_flags; -extern unsigned int ubifs_tst_flags; +static inline int dbg_is_chk_gen(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_gen || c->dbg->chk_gen); +} +static inline int dbg_is_chk_index(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_index || c->dbg->chk_index); +} +static inline int dbg_is_chk_orph(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_orph || c->dbg->chk_orph); +} +static inline int dbg_is_chk_lprops(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_lprops || c->dbg->chk_lprops); +} +static inline int dbg_is_chk_fs(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_fs || c->dbg->chk_fs); +} +static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.tst_rcvry || c->dbg->tst_rcvry); +} +static inline int dbg_is_power_cut(const struct ubifs_info *c) +{ + return !!c->dbg->pc_happened; +} int ubifs_debugging_init(struct ubifs_info *c); void ubifs_debugging_exit(struct ubifs_info *c); @@ -207,7 +258,7 @@ const char *dbg_cstate(int cmt_state); const char *dbg_jhead(int jhead); const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); -void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); +void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); void dbg_dump_node(const struct ubifs_info *c, const void *node); void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, int offs); @@ -240,8 +291,8 @@ int dbg_check_cats(struct ubifs_info *c); int dbg_check_ltab(struct ubifs_info *c); int dbg_chk_lpt_free_spc(struct ubifs_info *c); int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); -int dbg_check_synced_i_size(struct inode *inode); -int 
dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode); +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir); int dbg_check_tnc(struct ubifs_info *c, int extra); int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); int dbg_check_filesystem(struct ubifs_info *c); @@ -254,54 +305,12 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); -/* Force the use of in-the-gaps method for testing */ -static inline int dbg_force_in_the_gaps_enabled(void) -{ - return ubifs_chk_flags & UBIFS_CHK_GEN; -} -int dbg_force_in_the_gaps(void); - -/* Failure mode for recovery testing */ -#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) - -#ifndef UBIFS_DBG_PRESERVE_UBI -#define ubi_leb_read dbg_leb_read -#define ubi_leb_write dbg_leb_write -#define ubi_leb_change dbg_leb_change -#define ubi_leb_erase dbg_leb_erase -#define ubi_leb_unmap dbg_leb_unmap -#define ubi_is_mapped dbg_is_mapped -#define ubi_leb_map dbg_leb_map -#endif - -int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, - int len, int check); -int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, - int offset, int len, int dtype); -int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, - int len, int dtype); -int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); -int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); -int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); -int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); - -static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, - int offset, int len) -{ - return dbg_leb_read(desc, lnum, buf, offset, len, 0); -} - -static inline int dbg_write(struct 
ubi_volume_desc *desc, int lnum, - const void *buf, int offset, int len) -{ - return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); -} - -static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, - const void *buf, int len) -{ - return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); -} +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len, int dtype); +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, + int dtype); +int dbg_leb_unmap(struct ubifs_info *c, int lnum); +int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype); /* Debugfs-related stuff */ int dbg_debugfs_init(void); @@ -313,7 +322,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); /* Use "if (0)" to make compiler check arguments even if debugging is off */ #define ubifs_assert(expr) do { \ - if (0 && (expr)) \ + if (0) \ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ } while (0) @@ -323,6 +332,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); ubifs_err(fmt, ##__VA_ARGS__); \ } while (0) +#define DBGKEY(key) ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + #define ubifs_dbg_msg(fmt, ...) do { \ if (0) \ pr_debug(fmt "\n", ##__VA_ARGS__); \ @@ -346,9 +358,6 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) #define dbg_rcvry(fmt, ...) 
ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define DBGKEY(key) ((char *)(key)) -#define DBGKEY1(key) ((char *)(key)) - static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } static inline const char *dbg_ntype(int type) { return ""; } @@ -357,7 +366,7 @@ static inline const char *dbg_jhead(int jhead) { return ""; } static inline const char * dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key) { return ""; } -static inline void dbg_dump_inode(const struct ubifs_info *c, +static inline void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) { return; } static inline void dbg_dump_node(const struct ubifs_info *c, const void *node) { return; } @@ -409,9 +418,11 @@ static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; } static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; } static inline int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) { return 0; } -static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; } -static inline int dbg_check_dir_size(struct ubifs_info *c, - const struct inode *dir) { return 0; } +static inline int +dbg_check_synced_i_size(const struct ubifs_info *c, + struct inode *inode) { return 0; } +static inline int dbg_check_dir(struct ubifs_info *c, + const struct inode *dir) { return 0; } static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; } static inline int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) { return 0; } @@ -431,9 +442,23 @@ static inline int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) { return 0; } -static inline int dbg_force_in_the_gaps(void) { return 0; } -#define dbg_force_in_the_gaps_enabled() 0 -#define dbg_failure_mode 0 +static inline int dbg_leb_write(struct ubifs_info *c, int lnum, + const void *buf, int offset, + int len, int dtype) { return 0; } +static inline int 
dbg_leb_change(struct ubifs_info *c, int lnum, + const void *buf, int len, + int dtype) { return 0; } +static inline int dbg_leb_unmap(struct ubifs_info *c, int lnum) { return 0; } +static inline int dbg_leb_map(struct ubifs_info *c, int lnum, + int dtype) { return 0; } + +static inline int dbg_is_chk_gen(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_index(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_orph(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_lprops(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_chk_fs(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) { return 0; } +static inline int dbg_is_power_cut(const struct ubifs_info *c) { return 0; } static inline int dbg_debugfs_init(void) { return 0; } static inline void dbg_debugfs_exit(void) { return; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ef5abd38f0b..68349204331 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -102,7 +102,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, * UBIFS has to fully control "clean <-> dirty" transitions of inodes * to make budgeting work. 
*/ - inode->i_flags |= (S_NOCMTIME); + inode->i_flags |= S_NOCMTIME; inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = @@ -172,9 +172,11 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, #ifdef CONFIG_UBIFS_FS_DEBUG -static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) +static int dbg_check_name(const struct ubifs_info *c, + const struct ubifs_dent_node *dent, + const struct qstr *nm) { - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (le16_to_cpu(dent->nlen) != nm->len) return -EINVAL; @@ -185,7 +187,7 @@ static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) #else -#define dbg_check_name(dent, nm) 0 +#define dbg_check_name(c, dent, nm) 0 #endif @@ -219,7 +221,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - if (dbg_check_name(dent, &dentry->d_name)) { + if (dbg_check_name(c, dent, &dentry->d_name)) { err = -EINVAL; goto out; } @@ -522,7 +524,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - err = dbg_check_synced_i_size(inode); + err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -577,7 +579,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) inode->i_nlink, dir->i_ino); ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - err = dbg_check_synced_i_size(inode); + err = dbg_check_synced_i_size(c, inode); if (err) return err; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5e7fccfc4b2..7cf738a4544 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1263,7 +1263,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - err = dbg_check_synced_i_size(inode); + err = dbg_check_synced_i_size(c, inode); if (err) return err; diff --git a/fs/ubifs/io.c 
b/fs/ubifs/io.c index 3be645e012c..9228950a658 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -86,8 +86,125 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) c->no_chk_data_crc = 0; c->vfs_sb->s_flags |= MS_RDONLY; ubifs_warn("switched to read-only mode, error %d", err); + dump_stack(); + } +} + +/* + * Below are simple wrappers over UBI I/O functions which include some + * additional checks and UBIFS debugging stuff. See corresponding UBI function + * for more information. + */ + +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg) +{ + int err; + + err = ubi_read(c->ubi, lnum, buf, offs, len); + /* + * In case of %-EBADMSG print the error message only if the + * @even_ebadmsg is true. + */ + if (err && (err != -EBADMSG || even_ebadmsg)) { + ubifs_err("reading %d bytes from LEB %d:%d failed, error %d", + len, lnum, offs, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + else + err = dbg_leb_write(c, lnum, buf, offs, len, dtype); + if (err) { + ubifs_err("writing %d bytes to LEB %d:%d failed, error %d", + len, lnum, offs, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, + int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + else + err = dbg_leb_change(c, lnum, buf, len, dtype); + if (err) { + ubifs_err("changing %d bytes in LEB %d failed, error %d", + len, lnum, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_unmap(struct ubifs_info *c, int 
lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_unmap(c->ubi, lnum); + else + err = dbg_leb_unmap(c, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_map(c->ubi, lnum, dtype); + else + err = dbg_leb_map(c, lnum, dtype); + if (err) { + ubifs_err("mapping LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dbg_dump_stack(); + } + return err; +} + +int ubifs_is_mapped(const struct ubifs_info *c, int lnum) +{ + int err; + + err = ubi_is_mapped(c->ubi, lnum); + if (err < 0) { + ubifs_err("ubi_is_mapped failed for LEB %d, error %d", + lnum, err); dbg_dump_stack(); } + return err; } /** @@ -406,14 +523,10 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dirt = sync_len - wbuf->used; if (dirt) ubifs_pad(c, wbuf->buf + wbuf->used, dirt); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - sync_len, wbuf->dtype); - if (err) { - ubifs_err("cannot write %d bytes to LEB %d:%d", - sync_len, wbuf->lnum, wbuf->offs); - dbg_dump_stack(); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len, + wbuf->dtype); + if (err) return err; - } spin_lock(&wbuf->lock); wbuf->offs += sync_len; @@ -605,9 +718,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (aligned_len == wbuf->avail) { dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, - wbuf->offs, wbuf->size, - wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, + wbuf->offs, wbuf->size, + wbuf->dtype); if (err) goto out; @@ -642,8 +755,8 @@ int 
ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - wbuf->size, wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); if (err) goto out; @@ -661,8 +774,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ dbg_io("write %d bytes to LEB %d:%d", wbuf->size, wbuf->lnum, wbuf->offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, - wbuf->size, wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); if (err) goto out; @@ -683,8 +796,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) n <<= c->max_write_shift; dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, - wbuf->offs, n, wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, n, wbuf->dtype); if (err) goto out; wbuf->offs += n; @@ -766,13 +879,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, return -EROFS; ubifs_prepare_node(c, buf, len, 1); - err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); - if (err) { - ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", - buf_len, lnum, offs, err); + err = ubifs_leb_write(c, lnum, buf, offs, buf_len, dtype); + if (err) dbg_dump_node(c, buf); - dbg_dump_stack(); - } return err; } @@ -824,13 +933,9 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, if (rlen > 0) { /* Read everything that goes before write-buffer */ - err = ubi_read(c->ubi, lnum, buf, offs, rlen); - if (err && err != -EBADMSG) { - ubifs_err("failed to read node %d from LEB %d:%d, " - "error %d", type, lnum, offs, err); - dbg_dump_stack(); + err = ubifs_leb_read(c, 
lnum, buf, offs, rlen, 0); + if (err && err != -EBADMSG) return err; - } } if (type != ch->node_type) { @@ -885,12 +990,9 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, ubifs_assert(!(offs & 7) && offs < c->leb_size); ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); - err = ubi_read(c->ubi, lnum, buf, offs, len); - if (err && err != -EBADMSG) { - ubifs_err("cannot read node %d from LEB %d:%d, error %d", - type, lnum, offs, err); + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); + if (err && err != -EBADMSG) return err; - } if (type != ch->node_type) { ubifs_err("bad node type (%d but expected %d)", diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index affea9494ae..f9fd068d1ae 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -262,7 +262,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) * an unclean reboot, because the target LEB might have been * unmapped, but not yet physically erased. */ - err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM); + err = ubifs_leb_map(c, bud->lnum, UBI_SHORTTERM); if (err) goto out_unlock; } @@ -283,8 +283,6 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) return 0; out_unlock: - if (err != -EAGAIN) - ubifs_ro_mode(c, err); mutex_unlock(&c->log_mutex); kfree(ref); kfree(bud); @@ -752,7 +750,7 @@ static int dbg_check_bud_bytes(struct ubifs_info *c) struct ubifs_bud *bud; long long bud_bytes = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; spin_lock(&c->buds_lock); diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 667884f4a61..f8a181e647c 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -504,7 +504,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) pnode = (struct ubifs_pnode *)container_of(lprops - pos, struct ubifs_pnode, lprops[0]); - return !test_bit(COW_ZNODE, &pnode->flags) && + return !test_bit(COW_CNODE, &pnode->flags) && test_bit(DIRTY_CNODE, 
&pnode->flags); } @@ -860,7 +860,7 @@ int dbg_check_cats(struct ubifs_info *c) struct list_head *pos; int i, cat; - if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) return 0; list_for_each_entry(lprops, &c->empty_list, list) { @@ -958,7 +958,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, { int i = 0, j, err = 0; - if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) return; for (i = 0; i < heap->cnt; i++) { @@ -1262,7 +1262,7 @@ int dbg_check_lprops(struct ubifs_info *c) int i, err; struct ubifs_lp_stats lst; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; /* diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index ef5155e109a..6189c74d97f 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -701,8 +701,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, + UBI_SHORTTERM); if (err) goto out; p = buf; @@ -732,8 +732,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, + UBI_SHORTTERM); if (err) goto out; p = buf; @@ -780,8 +780,8 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, + UBI_SHORTTERM); if (err) goto out; p = buf; @@ -806,7 +806,7 @@ 
int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen, UBI_SHORTTERM); if (err) goto out; p = buf; @@ -826,7 +826,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Write remaining buffer */ memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, buf, alen, UBI_SHORTTERM); if (err) goto out; @@ -1222,7 +1222,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) if (c->big_lpt) nnode->num = calc_nnode_num_from_parent(c, parent, iip); } else { - err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); + err = ubifs_leb_read(c, lnum, buf, offs, c->nnode_sz, 1); if (err) goto out; err = ubifs_unpack_nnode(c, buf, nnode); @@ -1247,6 +1247,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + dbg_dump_stack(); kfree(nnode); return err; } @@ -1290,7 +1291,7 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) lprops->flags = ubifs_categorize_lprops(c, lprops); } } else { - err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz); + err = ubifs_leb_read(c, lnum, buf, offs, c->pnode_sz, 1); if (err) goto out; err = unpack_pnode(c, buf, pnode); @@ -1312,6 +1313,7 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); dbg_dump_pnode(c, pnode, parent, iip); + dbg_dump_stack(); dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); kfree(pnode); return err; @@ -1331,7 +1333,7 @@ static int read_ltab(struct ubifs_info *c) buf = vmalloc(c->ltab_sz); if (!buf) return 
-ENOMEM; - err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); + err = ubifs_leb_read(c, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz, 1); if (err) goto out; err = unpack_ltab(c, buf); @@ -1354,7 +1356,8 @@ static int read_lsave(struct ubifs_info *c) buf = vmalloc(c->lsave_sz); if (!buf) return -ENOMEM; - err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz); + err = ubifs_leb_read(c, c->lsave_lnum, buf, c->lsave_offs, + c->lsave_sz, 1); if (err) goto out; err = unpack_lsave(c, buf); @@ -1814,8 +1817,8 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, if (c->big_lpt) nnode->num = calc_nnode_num_from_parent(c, parent, iip); } else { - err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, - c->nnode_sz); + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->nnode_sz, 1); if (err) return ERR_PTR(err); err = ubifs_unpack_nnode(c, buf, nnode); @@ -1883,8 +1886,8 @@ static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, ubifs_assert(branch->lnum >= c->lpt_first && branch->lnum <= c->lpt_last); ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); - err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, - c->pnode_sz); + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->pnode_sz, 1); if (err) return ERR_PTR(err); err = unpack_pnode(c, buf, pnode); @@ -2224,7 +2227,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, struct ubifs_cnode *cn; int num, iip = 0, err; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; while (cnode) { diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index dfcb5748a7d..cddd6bd214f 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -27,6 +27,7 @@ #include <linux/crc16.h> #include <linux/slab.h> +#include <linux/random.h> #include "ubifs.h" #ifdef CONFIG_UBIFS_FS_DEBUG @@ -116,8 +117,8 @@ static int get_cnodes_to_commit(struct ubifs_info *c) return 0; cnt += 1; while (1) { - 
ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); - __set_bit(COW_ZNODE, &cnode->flags); + ubifs_assert(!test_bit(COW_CNODE, &cnode->flags)); + __set_bit(COW_CNODE, &cnode->flags); cnext = next_dirty_cnode(cnode); if (!cnext) { cnode->cnext = c->lpt_cnext; @@ -465,7 +466,7 @@ static int write_cnodes(struct ubifs_info *c) */ clear_bit(DIRTY_CNODE, &cnode->flags); smp_mb__before_clear_bit(); - clear_bit(COW_ZNODE, &cnode->flags); + clear_bit(COW_CNODE, &cnode->flags); smp_mb__after_clear_bit(); offs += len; dbg_chk_lpt_sz(c, 1, len); @@ -1160,11 +1161,11 @@ static int lpt_gc_lnum(struct ubifs_info *c, int lnum) void *buf = c->lpt_buf; dbg_lp("LEB %d", lnum); - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - ubifs_err("cannot read LEB %d, error %d", lnum, err); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) return err; - } + while (1) { if (!is_a_node(c, buf, len)) { int pad_len; @@ -1640,7 +1641,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) int ret; void *buf, *p; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); @@ -1650,11 +1651,11 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) } dbg_lp("LEB %d", lnum); - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) goto out; - } + while (1) { if (!is_a_node(c, p, len)) { int i, pad_len; @@ -1711,7 +1712,7 @@ int dbg_check_ltab(struct ubifs_info *c) { int lnum, err, i, cnt; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; /* Bring the entire tree into memory */ @@ -1754,7 +1755,7 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) long long free = 0; int i; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; for (i = 0; i < c->lpt_lebs; i++) { @@ 
-1796,7 +1797,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) long long chk_lpt_sz, lpt_sz; int err = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; switch (action) { @@ -1901,11 +1902,10 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) return; } - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - ubifs_err("cannot read LEB %d, error %d", lnum, err); + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) goto out; - } + while (1) { offs = c->leb_size - len; if (!is_a_node(c, p, len)) { @@ -2019,7 +2019,7 @@ static int dbg_populate_lsave(struct ubifs_info *c) struct ubifs_lpt_heap *heap; int i; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (random32() & 3) return 0; diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 0b5296a9a4c..ee7cb5ebb6e 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -39,6 +39,29 @@ static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) } /** + * ubifs_zn_obsolete - check if znode is obsolete. + * @znode: znode to check + * + * This helper function returns %1 if @znode is obsolete and %0 otherwise. + */ +static inline int ubifs_zn_obsolete(const struct ubifs_znode *znode) +{ + return !!test_bit(OBSOLETE_ZNODE, &znode->flags); +} + +/** + * ubifs_zn_cow - check if znode has to be copied on write. + * @znode: znode to check + * + * This helper function returns %1 if @znode is has COW flag set and %0 + * otherwise. + */ +static inline int ubifs_zn_cow(const struct ubifs_znode *znode) +{ + return !!test_bit(COW_ZNODE, &znode->flags); +} + +/** * ubifs_wake_up_bgt - wake up background thread. * @c: UBIFS file-system description object */ @@ -122,86 +145,6 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) } /** - * ubifs_leb_unmap - unmap an LEB. 
- * @c: UBIFS file-system description object - * @lnum: LEB number to unmap - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) -{ - int err; - - ubifs_assert(!c->ro_media && !c->ro_mount); - if (c->ro_error) - return -EROFS; - err = ubi_leb_unmap(c->ubi, lnum); - if (err) { - ubifs_err("unmap LEB %d failed, error %d", lnum, err); - return err; - } - - return 0; -} - -/** - * ubifs_leb_write - write to a LEB. - * @c: UBIFS file-system description object - * @lnum: LEB number to write - * @buf: buffer to write from - * @offs: offset within LEB to write to - * @len: length to write - * @dtype: data type - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, - const void *buf, int offs, int len, int dtype) -{ - int err; - - ubifs_assert(!c->ro_media && !c->ro_mount); - if (c->ro_error) - return -EROFS; - err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); - if (err) { - ubifs_err("writing %d bytes at %d:%d, error %d", - len, lnum, offs, err); - return err; - } - - return 0; -} - -/** - * ubifs_leb_change - atomic LEB change. - * @c: UBIFS file-system description object - * @lnum: LEB number to write - * @buf: buffer to write from - * @len: length to write - * @dtype: data type - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, - const void *buf, int len, int dtype) -{ - int err; - - ubifs_assert(!c->ro_media && !c->ro_mount); - if (c->ro_error) - return -EROFS; - err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); - if (err) { - ubifs_err("changing %d bytes in LEB %d, error %d", - len, lnum, err); - return err; - } - - return 0; -} - -/** * ubifs_encode_dev - encode device node IDs. 
* @dev: UBIFS device node information * @rdev: device IDs to encode diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index a5422fffbd6..c542c73cfa3 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -929,7 +929,7 @@ static int dbg_check_orphans(struct ubifs_info *c) struct check_info ci; int err; - if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + if (!dbg_is_chk_orph(c)) return 0; ci.last_ino = 0; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 783d8e0beb7..af02790d932 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -117,7 +117,7 @@ static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, if (!sbuf) return -ENOMEM; - err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0); if (err && err != -EBADMSG) goto out_free; @@ -213,10 +213,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c, mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); - err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, mst, sz, UBI_SHORTTERM); if (err) goto out; - err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum + 1, mst, sz, UBI_SHORTTERM); if (err) goto out; out: @@ -274,7 +274,8 @@ int ubifs_recover_master_node(struct ubifs_info *c) if (cor1) goto out_err; mst = mst1; - } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + } else if (offs1 == 0 && + c->leb_size - offs2 - sz < sz) { /* 1st LEB was unmapped and written, 2nd not */ if (cor1) goto out_err; @@ -539,8 +540,8 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, int len = ALIGN(endpt, c->min_io_size); if (start) { - err = ubi_read(c->ubi, lnum, sleb->buf, 0, - start); + err = ubifs_leb_read(c, lnum, sleb->buf, 0, + start, 1); if (err) return err; } @@ -554,8 +555,8 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_pad(c, buf, 
pad_len); } } - err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, - UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sleb->buf, len, + UBI_UNKNOWN); if (err) return err; } @@ -819,7 +820,8 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, return -ENOMEM; if (c->leb_size - offs < UBIFS_CS_NODE_SZ) goto out_err; - err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + err = ubifs_leb_read(c, lnum, (void *)cs_node, offs, + UBIFS_CS_NODE_SZ, 0); if (err && err != -EBADMSG) goto out_free; ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); @@ -919,8 +921,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, * * This function returns %0 on success and a negative error code on failure. */ -static int recover_head(const struct ubifs_info *c, int lnum, int offs, - void *sbuf) +static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf) { int len = c->max_write_size, err; @@ -931,15 +932,15 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs, return 0; /* Read at the head location and check it is empty flash */ - err = ubi_read(c->ubi, lnum, sbuf, offs, len); + err = ubifs_leb_read(c, lnum, sbuf, offs, len, 1); if (err || !is_empty(sbuf, len)) { dbg_rcvry("cleaning head at %d:%d", lnum, offs); if (offs == 0) return ubifs_leb_unmap(c, lnum); - err = ubi_read(c->ubi, lnum, sbuf, 0, offs); + err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1); if (err) return err; - return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, sbuf, offs, UBI_UNKNOWN); } return 0; @@ -962,7 +963,7 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs, * * This function returns %0 on success and a negative error code on failure. 
*/ -int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) { int err; @@ -993,7 +994,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) * * This function returns %0 on success and a negative error code on failure. */ -static int clean_an_unclean_leb(const struct ubifs_info *c, +static int clean_an_unclean_leb(struct ubifs_info *c, struct ubifs_unclean_leb *ucleb, void *sbuf) { int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; @@ -1009,7 +1010,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, return 0; } - err = ubi_read(c->ubi, lnum, buf, offs, len); + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); if (err && err != -EBADMSG) return err; @@ -1069,7 +1070,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, } /* Write back the LEB atomically */ - err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sbuf, len, UBI_UNKNOWN); if (err) return err; @@ -1089,7 +1090,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, * * This function returns %0 on success and a negative error code on failure. 
*/ -int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf) { dbg_rcvry("recovery"); while (!list_empty(&c->unclean_leb_list)) { @@ -1454,7 +1455,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) if (i_size >= e->d_size) return 0; /* Read the LEB */ - err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + err = ubifs_leb_read(c, lnum, c->sbuf, 0, c->leb_size, 1); if (err) goto out; /* Change the size field and recalculate the CRC */ @@ -1470,7 +1471,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) len -= 1; len = ALIGN(len + 1, c->min_io_size); /* Atomically write the fixed LEB back again */ - err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); if (err) goto out; dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 5e97161ce4d..ccabaf1164b 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -523,8 +523,7 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) if (!list_is_last(&next->list, &jh->buds_list)) return 0; - err = ubi_read(c->ubi, next->lnum, (char *)&data, - next->start, 4); + err = ubifs_leb_read(c, next->lnum, (char *)&data, next->start, 4, 1); if (err) return 0; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index c606f010e8d..93d938ad3d2 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -674,15 +674,15 @@ static int fixup_leb(struct ubifs_info *c, int lnum, int len) if (len == 0) { dbg_mnt("unmap empty LEB %d", lnum); - return ubi_leb_unmap(c->ubi, lnum); + return ubifs_leb_unmap(c, lnum); } dbg_mnt("fixup LEB %d, data len %d", lnum, len); - err = ubi_read(c->ubi, lnum, c->sbuf, 0, len); + err = ubifs_leb_read(c, lnum, c->sbuf, 0, len, 1); if (err) return err; - return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); } 
/** diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 36216b46f77..37383e8011b 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -148,7 +148,7 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, INIT_LIST_HEAD(&sleb->nodes); sleb->buf = sbuf; - err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0); if (err && err != -EBADMSG) { ubifs_err("cannot read %d bytes from LEB %d:%d," " error %d", c->leb_size - offs, lnum, offs, err); @@ -240,7 +240,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, int len; ubifs_err("corruption at LEB %d:%d", lnum, offs); - if (dbg_failure_mode) + if (dbg_is_tst_rcvry(c)) return; len = c->leb_size - offs; if (len > 8192) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b5aeb5a8ebe..b28121278d4 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -85,7 +85,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) return 4; - if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + if (ui->xattr && !S_ISREG(inode->i_mode)) return 5; if (!ubifs_compr_present(ui->compr_type)) { @@ -94,7 +94,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) ubifs_compr_name(ui->compr_type)); } - err = dbg_check_dir_size(c, inode); + err = dbg_check_dir(c, inode); return err; } @@ -914,7 +914,7 @@ static int check_volume_empty(struct ubifs_info *c) c->empty = 1; for (lnum = 0; lnum < c->leb_cnt; lnum++) { - err = ubi_is_mapped(c->ubi, lnum); + err = ubifs_is_mapped(c, lnum); if (unlikely(err < 0)) return err; if (err == 1) { @@ -1848,7 +1848,6 @@ static void ubifs_put_super(struct super_block *sb) bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); - kfree(c); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -1971,61 +1970,65 @@ 
static struct ubi_volume_desc *open_ubi(const char *name, int mode) return ERR_PTR(-EINVAL); } -static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) { - struct ubi_volume_desc *ubi = sb->s_fs_info; struct ubifs_info *c; - struct inode *root; - int err; c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); - if (!c) - return -ENOMEM; + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} - spin_lock_init(&c->cnt_lock); - spin_lock_init(&c->cs_lock); - spin_lock_init(&c->buds_lock); - spin_lock_init(&c->space_lock); - spin_lock_init(&c->orphan_lock); - init_rwsem(&c->commit_sem); - mutex_init(&c->lp_mutex); - mutex_init(&c->tnc_mutex); - mutex_init(&c->log_mutex); - mutex_init(&c->mst_mutex); - mutex_init(&c->umount_mutex); - mutex_init(&c->bu_mutex); - 
mutex_init(&c->write_reserve_mutex); - init_waitqueue_head(&c->cmt_wq); - c->buds = RB_ROOT; - c->old_idx = RB_ROOT; - c->size_tree = RB_ROOT; - c->orph_tree = RB_ROOT; - INIT_LIST_HEAD(&c->infos_list); - INIT_LIST_HEAD(&c->idx_gc); - INIT_LIST_HEAD(&c->replay_list); - INIT_LIST_HEAD(&c->replay_buds); - INIT_LIST_HEAD(&c->uncat_list); - INIT_LIST_HEAD(&c->empty_list); - INIT_LIST_HEAD(&c->freeable_list); - INIT_LIST_HEAD(&c->frdi_idx_list); - INIT_LIST_HEAD(&c->unclean_leb_list); - INIT_LIST_HEAD(&c->old_buds); - INIT_LIST_HEAD(&c->orph_list); - INIT_LIST_HEAD(&c->orph_new); - c->no_chk_data_crc = 1; +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubifs_info *c = sb->s_fs_info; + struct inode *root; + int err; c->vfs_sb = sb; - c->highest_inum = UBIFS_FIRST_INO; - c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; - - ubi_get_volume_info(ubi, &c->vi); - ubi_get_device_info(c->vi.ubi_num, &c->di); - /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { err = PTR_ERR(c->ubi); - goto out_free; + goto out; } /* @@ -2091,24 +2094,29 @@ out_bdi: bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); -out_free: - kfree(c); +out: return err; } static int sb_test(struct super_block *sb, void *data) { - dev_t *dev = data; + struct ubifs_info *c1 = data; struct ubifs_info *c = sb->s_fs_info; - return c->vi.cdev == *dev; + return c->vi.cdev == c1->vi.cdev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, const char *name, void *data) { struct ubi_volume_desc *ubi; - struct ubi_volume_info vi; + struct ubifs_info *c; struct super_block *sb; int err; @@ -2125,19 +2133,25 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } - 
ubi_get_volume_info(ubi, &vi); - dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } + + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev); + sb = sget(fs_type, sb_test, sb_set, c); if (IS_ERR(sb)) { err = PTR_ERR(sb); + kfree(c); goto out_close; } if (sb->s_root) { struct ubifs_info *c1 = sb->s_fs_info; - + kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); if (!!(flags & MS_RDONLY) != c1->ro_mount) { @@ -2146,11 +2160,6 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, } } else { sb->s_flags = flags; - /* - * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is - * replaced by 'c'. - */ - sb->s_fs_info = ubi; err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) goto out_deact; @@ -2170,11 +2179,18 @@ out_close: return ERR_PTR(err); } +static void kill_ubifs_super(struct super_block *s) +{ + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); +} + static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .mount = ubifs_mount, - .kill_sb = kill_anon_super, + .kill_sb = kill_ubifs_super, }; /* diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 91b4213dde8..06673864768 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -223,7 +223,7 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c, __set_bit(DIRTY_ZNODE, &zn->flags); __clear_bit(COW_ZNODE, &zn->flags); - ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_obsolete(znode)); __set_bit(OBSOLETE_ZNODE, &znode->flags); if (znode->level != 0) { @@ -271,7 +271,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, struct ubifs_znode *zn; int err; - if (!test_bit(COW_ZNODE, &znode->flags)) { + if (!ubifs_zn_cow(znode)) { /* znode is not being committed */ if 
(!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { atomic_long_inc(&c->dirty_zn_cnt); @@ -462,7 +462,7 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); - err = ubi_read(c->ubi, lnum, buf, offs, len); + err = ubifs_leb_read(c, lnum, buf, offs, len, 1); if (err) { ubifs_err("cannot read node type %d from LEB %d:%d, error %d", type, lnum, offs, err); @@ -1666,7 +1666,7 @@ static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, if (!overlap) { /* We may safely unlock the write-buffer and read the data */ spin_unlock(&wbuf->lock); - return ubi_read(c->ubi, lnum, buf, offs, len); + return ubifs_leb_read(c, lnum, buf, offs, len, 0); } /* Don't read under wbuf */ @@ -1680,7 +1680,7 @@ static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, if (rlen > 0) /* Read everything that goes before write-buffer */ - return ubi_read(c->ubi, lnum, buf, offs, rlen); + return ubifs_leb_read(c, lnum, buf, offs, rlen, 0); return 0; } @@ -1767,7 +1767,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) if (wbuf) err = read_wbuf(wbuf, bu->buf, len, lnum, offs); else - err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + err = ubifs_leb_read(c, lnum, bu->buf, offs, len, 0); /* Check for a race with GC */ if (maybe_leb_gced(c, lnum, bu->gc_seq)) @@ -2423,7 +2423,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) */ do { - ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_obsolete(znode)); ubifs_assert(ubifs_zn_dirty(znode)); zp = znode->parent; @@ -2479,9 +2479,8 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) c->zroot.offs = zbr->offs; c->zroot.len = zbr->len; c->zroot.znode = znode; - ubifs_assert(!test_bit(OBSOLETE_ZNODE, - &zp->flags)); - ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + ubifs_assert(!ubifs_zn_obsolete(zp)); + 
ubifs_assert(ubifs_zn_dirty(zp)); atomic_long_dec(&c->dirty_zn_cnt); if (zp->cnext) { @@ -2865,7 +2864,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c) struct ubifs_znode *znode = cnext; cnext = cnext->cnext; - if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + if (ubifs_zn_obsolete(znode)) kfree(znode); } while (cnext && cnext != c->cnext); } @@ -3301,7 +3300,7 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, if (!S_ISREG(inode->i_mode)) return 0; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; @@ -3337,9 +3336,10 @@ out_dump: ubifs_err("inode %lu has size %lld, but there are data at offset %lld " "(data key %s)", (unsigned long)inode->i_ino, size, ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); + mutex_unlock(&c->tnc_mutex); dbg_dump_inode(c, inode); dbg_dump_stack(); - err = -EINVAL; + return -EINVAL; out_unlock: mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 41920f357bb..4c15f07a8bb 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -22,6 +22,7 @@ /* This file implements TNC functions for committing */ +#include <linux/random.h> #include "ubifs.h" /** @@ -87,8 +88,12 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, atomic_long_dec(&c->dirty_zn_cnt); ubifs_assert(ubifs_zn_dirty(znode)); - ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_cow(znode)); + /* + * Note, unlike 'write_index()' we do not add memory barriers here + * because this function is called with @c->tnc_mutex locked. 
+ */ __clear_bit(DIRTY_ZNODE, &znode->flags); __clear_bit(COW_ZNODE, &znode->flags); @@ -377,7 +382,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) c->gap_lebs = NULL; return err; } - if (dbg_force_in_the_gaps_enabled()) { + if (!dbg_is_chk_index(c)) { /* * Do not print scary warnings if the debugging * option which forces in-the-gaps is enabled. @@ -491,25 +496,6 @@ static int layout_in_empty_space(struct ubifs_info *c) else next_len = ubifs_idx_node_sz(c, cnext->child_cnt); - if (c->min_io_size == 1) { - buf_offs += ALIGN(len, 8); - if (next_len) { - if (buf_offs + next_len <= c->leb_size) - continue; - err = ubifs_update_one_lp(c, lnum, 0, - c->leb_size - buf_offs, 0, 0); - if (err) - return err; - lnum = -1; - continue; - } - err = ubifs_update_one_lp(c, lnum, - c->leb_size - buf_offs, 0, 0, 0); - if (err) - return err; - break; - } - /* Update buffer positions */ wlen = used + len; used += ALIGN(len, 8); @@ -658,7 +644,7 @@ static int get_znodes_to_commit(struct ubifs_info *c) } cnt += 1; while (1) { - ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_cow(znode)); __set_bit(COW_ZNODE, &znode->flags); znode->alt = 0; cnext = find_next_dirty(znode); @@ -704,7 +690,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_force_in_the_gaps()) + if (dbg_is_chk_index(c) && !(random32() & 7)) return -ENOSPC; return 0; } @@ -830,7 +816,7 @@ static int write_index(struct ubifs_info *c) struct ubifs_idx_node *idx; struct ubifs_znode *znode, *cnext; int i, lnum, offs, len, next_len, buf_len, buf_offs, used; - int avail, wlen, err, lnum_pos = 0; + int avail, wlen, err, lnum_pos = 0, blen, nxt_offs; cnext = c->enext; if (!cnext) @@ -907,7 +893,7 @@ static int write_index(struct ubifs_info *c) cnext = znode->cnext; ubifs_assert(ubifs_zn_dirty(znode)); - ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_cow(znode)); /* * It is important 
that other threads should see %DIRTY_ZNODE @@ -922,6 +908,28 @@ static int write_index(struct ubifs_info *c) clear_bit(COW_ZNODE, &znode->flags); smp_mb__after_clear_bit(); + /* + * We have marked the znode as clean but have not updated the + * @c->clean_zn_cnt counter. If this znode becomes dirty again + * before 'free_obsolete_znodes()' is called, then + * @c->clean_zn_cnt will be decremented before it gets + * incremented (resulting in 2 decrements for the same znode). + * This means that @c->clean_zn_cnt may become negative for a + * while. + * + * Q: why we cannot increment @c->clean_zn_cnt? + * A: because we do not have the @c->tnc_mutex locked, and the + * following code would be racy and buggy: + * + * if (!ubifs_zn_obsolete(znode)) { + * atomic_long_inc(&c->clean_zn_cnt); + * atomic_long_inc(&ubifs_clean_zn_cnt); + * } + * + * Thus, we just delay the @c->clean_zn_cnt update until we + * have the mutex locked. + */ + /* Do not access znode from this point on */ /* Update buffer positions */ @@ -938,65 +946,38 @@ static int write_index(struct ubifs_info *c) else next_len = ubifs_idx_node_sz(c, cnext->child_cnt); - if (c->min_io_size == 1) { - /* - * Write the prepared index node immediately if there is - * no minimum IO size - */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, - wlen, UBI_SHORTTERM); - if (err) - return err; - buf_offs += ALIGN(wlen, 8); - if (next_len) { - used = 0; - avail = buf_len; - if (buf_offs + next_len > c->leb_size) { - err = ubifs_update_one_lp(c, lnum, - LPROPS_NC, 0, 0, LPROPS_TAKEN); - if (err) - return err; - lnum = -1; - } + nxt_offs = buf_offs + used + next_len; + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) continue; - } + else + blen = buf_len; } else { - int blen, nxt_offs = buf_offs + used + next_len; - - if (next_len && nxt_offs <= c->leb_size) { - if (avail > 0) - continue; - else - blen = buf_len; - } else { - wlen = ALIGN(wlen, 8); - blen = ALIGN(wlen, c->min_io_size); - ubifs_pad(c, c->cbuf + 
wlen, blen - wlen); - } - /* - * The buffer is full or there are no more znodes - * to do - */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, - blen, UBI_SHORTTERM); - if (err) - return err; - buf_offs += blen; - if (next_len) { - if (nxt_offs > c->leb_size) { - err = ubifs_update_one_lp(c, lnum, - LPROPS_NC, 0, 0, LPROPS_TAKEN); - if (err) - return err; - lnum = -1; - } - used -= blen; - if (used < 0) - used = 0; - avail = buf_len - used; - memmove(c->cbuf, c->cbuf + blen, used); - continue; + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + + /* The buffer is full or there are no more znodes to do */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen, + UBI_SHORTTERM); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, + 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; } break; } @@ -1029,7 +1010,7 @@ static void free_obsolete_znodes(struct ubifs_info *c) do { znode = cnext; cnext = znode->cnext; - if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + if (ubifs_zn_obsolete(znode)) kfree(znode); else { znode->cnext = NULL; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index f79983d6f86..702b79258e3 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -230,14 +230,14 @@ enum { * LPT cnode flag bits. 
* * DIRTY_CNODE: cnode is dirty - * COW_CNODE: cnode is being committed and must be copied before writing * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), - * so it can (and must) be freed when the commit is finished + * so it can (and must) be freed when the commit is finished + * COW_CNODE: cnode is being committed and must be copied before writing */ enum { DIRTY_CNODE = 0, - COW_CNODE = 1, - OBSOLETE_CNODE = 2, + OBSOLETE_CNODE = 1, + COW_CNODE = 2, }; /* @@ -1468,6 +1468,15 @@ extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg); +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len, int dtype); +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, + int dtype); +int ubifs_leb_unmap(struct ubifs_info *c, int lnum); +int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype); +int ubifs_is_mapped(const struct ubifs_info *c, int lnum); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, int dtype); @@ -1747,8 +1756,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int jhead); struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf); -int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); -int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf); int ubifs_rcvry_gc_commit(struct ubifs_info *c); int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, int deletion, loff_t new_size); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 
29309e25417..b57aab9a118 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,16 +56,12 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); - if (ino) { + if (ino) inode = ufs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - unlock_ufs(dir->i_sb); - return ERR_CAST(inode); - } - } unlock_ufs(dir->i_sb); - d_add(dentry, inode); - return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_splice_alias(inode, dentry); } /* |