Diffstat (limited to 'fs'): 63 files changed, 2082 insertions, 436 deletions
@@ -476,14 +476,21 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total) batch->count = total; } -static void kiocb_batch_free(struct kiocb_batch *batch) +static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) { struct kiocb *req, *n; + if (list_empty(&batch->head)) + return; + + spin_lock_irq(&ctx->ctx_lock); list_for_each_entry_safe(req, n, &batch->head, ki_batch) { list_del(&req->ki_batch); + list_del(&req->ki_list); kmem_cache_free(kiocb_cachep, req); + ctx->reqs_active--; } + spin_unlock_irq(&ctx->ctx_lock); } /* @@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - kiocb_batch_free(&batch); + kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 9ef5b291440..da8876d38a7 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -76,7 +76,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, data += wr; bytes -= wr; } - mutex_lock(&sbi->pipe_mutex); + mutex_unlock(&sbi->pipe_mutex); set_fs(fs); diff --git a/fs/block_dev.c b/fs/block_dev.c index afe74dda632..0e575d1304b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099a774..d8525662ca7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 74fd74719dc..618246bc219 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -973,7 +973,7 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di && di->lease_session) { + if (di->lease_session) { s = di->lease_session; spin_lock(&s->s_cap_lock); gen = s->s_cap_gen; @@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); dout("d_release %p\n", dentry); - if (di) { - ceph_dentry_lru_del(dentry); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); - 
kmem_cache_free(ceph_dentry_cachep, di); - dentry->d_fsdata = NULL; - } + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; } static int ceph_snapdir_d_revalidate(struct dentry *dentry, @@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, */ void ceph_dir_set_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry) && + ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } void ceph_dir_clear_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } bool ceph_dir_test_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); return false; } @@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, do { ceph_mdsc_get_request(req); spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { @@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, } else { wait_for_completion(&req->r_safe_completion); } - spin_lock(&ci->i_unsafe_lock); ceph_mdsc_put_request(req); + spin_lock(&ci->i_unsafe_lock); if (ret || list_empty(head)) break; req = list_entry(head->next, @@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_touch(struct dentry *dn) @@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_del(struct dentry *dn) @@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + 
list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); } /* diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9fbcdecaacc..fbb2a643ef1 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, return -EINVAL; spin_lock(&dentry->d_lock); - parent = dget(dentry->d_parent); - spin_unlock(&dentry->d_lock); - + parent = dentry->d_parent; if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); @@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } - dput(parent); + spin_unlock(&dentry->d_lock); return type; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 25283e7a37f..2c489378b4c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -850,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_info *ci; struct ceph_dentry_info *di; BUG_ON(!inode); + ci = ceph_inode(inode); di = ceph_dentry(dn); spin_lock(&ci->i_ceph_lock); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6203d805eb4..23ab6a3f182 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2772,7 +2772,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di = ceph_dentry(dentry); switch (h->action) { case CEPH_MDS_LEASE_REVOKE: - if (di && di->lease_session == session) { + if (di->lease_session == session) { if (ceph_seq_cmp(di->lease_seq, seq) > 0) h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); @@ -2781,7 +2781,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, break; case CEPH_MDS_LEASE_RENEW: - if (di && di->lease_session == session && + if (di->lease_session == session && di->lease_gen == session->s_cap_gen && di->lease_renew_from && di->lease_renew_after == 0) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 48f61a12af6..00de2c9568c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -131,6 +131,8 @@ enum { Opt_rbytes, Opt_norbytes, Opt_noasyncreaddir, + Opt_dcache, + Opt_nodcache, Opt_ino32, }; @@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = { {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_dcache, "dcache"}, + {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, {-1, NULL} }; @@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; + case Opt_dcache: + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_nodcache: + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + break; case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; @@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) + seq_puts(m, ",dcache"); + else + seq_puts(m, ",nodcache"); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); @@ -647,10 +661,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, root = ERR_PTR(-ENOMEM); goto out; } - ceph_init_dentry(root); } else { root = d_obtain_alias(inode); } + ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); } else { root = 
ERR_PTR(err); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cb3652b3727..1421f3d875a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -28,6 +28,7 @@ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a5e36e4488a..857214ae8c0 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -818,6 +818,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; + int required_blob_size; int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -833,14 +834,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; } + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); __build_xattrs(inode); +retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + required_blob_size = __get_required_blob_size(ci, 0, 0); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + err = __remove_xattr_by_name(ceph_inode(inode), name); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; @@ -853,6 +874,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) do_sync: spin_unlock(&ci->i_ceph_lock); err = ceph_send_removexattr(dentry, name); +out: return err; } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a10e428b32b..a26bea10e81 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -105,6 +105,7 @@ #include <linux/hiddev.h> +#define __DVB_CORE__ #include <linux/dvb/audio.h> #include <linux/dvb/dmx.h> #include <linux/dvb/frontend.h> diff --git a/fs/dcache.c b/fs/dcache.c index 616fedff011..16a53cc2cc0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1475,7 +1475,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode) return alias; } -static struct dentry * d_find_any_alias(struct inode *inode) +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. + */ +struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; @@ -1484,7 +1491,7 @@ static struct dentry * d_find_any_alias(struct inode *inode) spin_unlock(&inode->i_lock); return de; } - +EXPORT_SYMBOL(d_find_any_alias); /** * d_obtain_alias - find or allocate a dentry for a given inode diff --git a/fs/direct-io.c b/fs/direct-io.c index d740ab67ff6..4a588dbd11b 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -36,6 +36,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> +#include <linux/prefetch.h> /* * How many user pages to map in one call to get_user_pages(). 
This determines @@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, { int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ - unsigned long dio_count;/* Number of dio_block-sized blocks */ - unsigned long blkmask; int create; /* @@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, if (ret == 0) { BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); fs_startblk = sdio->block_in_file >> sdio->blkfactor; - dio_count = sdio->final_block_in_request - sdio->block_in_file; - fs_count = dio_count >> sdio->blkfactor; - blkmask = (1 << sdio->blkfactor) - 1; - if (dio_count & blkmask) - fs_count++; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; @@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +static inline ssize_t +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, size_t size; unsigned long addr; unsigned blkbits = inode->i_blkbits; - unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw & WRITE) rw = WRITE_ODIRECT; - if (bdev) - bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. + */ if (offset & blocksize_mask) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) goto out; @@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (unlikely((addr & blocksize_mask) || + (size & blocksize_mask))) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits( + bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; } } @@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: return retval; } + +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. 
+ */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, + submit_io, flags); +} + EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750af23..aabdfc38cf2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -197,6 +197,12 @@ struct eventpoll { /* The user that created the eventpoll descriptor */ struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; }; /* Wait structure used by the poll hooks */ @@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -276,6 +291,12 @@ ctl_table epoll_table[] = { }; #endif /* CONFIG_SYSCTL */ +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an eventpoll file */ -static inline int is_file_epoll(struct file *f) -{ - return f->f_op == &eventpoll_fops; -} - /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are @@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. 
+ */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. + */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + /* * Must be called with "mtx" held. 
*/ @@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, */ ep_rbtree_insert(ep, epi); + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); @@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, return 0; +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + error_unregister: ep_unregister_pollwait(ep, epi); @@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) int error = 0; struct file *file = priv; struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, - ep_loop_check_proc, epi->ffd.file, - epi->ffd.file->private_data, current); + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); if (error != 0) break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); } } mutex_unlock(&ep->mtx); @@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); } /* @@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; + int error, fd; struct eventpoll *ep = NULL; + struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. 
*/ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); - if (error < 0) - ep_free(ep); - + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); return error; } @@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are - * better be handled here, than in more critical paths. + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. * - * We hold epmutex across the loop check and the insert in this case, in - * order to prevent two separate inserts from racing and each doing the - * insert "at the same time" such that ep_loop_check passes on both - * before either one does the insert, thereby creating a cycle. + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. */ - if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { mutex_lock(&epmutex); did_lock_epmutex = 1; - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) - goto error_tgt_fput; } - + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } mutex_lock_nested(&ep->mtx, 0); @@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: - if (unlikely(did_lock_epmutex)) + if (did_lock_epmutex) mutex_unlock(&epmutex); fput(tfile); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 88e8a23d002..376816fcd04 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) spin_lock(&gl->gl_spin); gl->gl_reply = ret; - if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2553b858a72..307ac31df78 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -121,8 +121,11 @@ enum { struct lm_lockops { const char *lm_proto_name; - int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); - void (*lm_unmount) (struct gfs2_sbd *sdp); + int (*lm_mount) (struct gfs2_sbd *sdp, const char *table); + void (*lm_first_done) (struct gfs2_sbd *sdp); + void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result); + void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct gfs2_glock *gl); int (*lm_lock) (struct gfs2_glock *gl, 
unsigned int req_state, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e1d3bb59945..97742a7ea9c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -139,8 +139,45 @@ struct gfs2_bufdata { #define GDLM_STRNAME_BYTES 25 #define GDLM_LVB_SIZE 32 +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. + */ + enum { DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, }; struct lm_lockname { @@ -392,6 +429,7 @@ struct gfs2_jdesc { #define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; + int jd_recover_error; }; struct gfs2_statfs_change_host { @@ -461,6 +499,7 @@ enum { SDF_NORECOVERY = 4, SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ }; #define GFS2_FSNAME_LEN 256 @@ -499,14 +538,26 @@ struct gfs2_sb_host { struct lm_lockstruct { int ls_jid; unsigned int ls_first; - unsigned int ls_first_done; unsigned int ls_nodir; const struct lm_lockops *ls_ops; - unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid_done; - int ls_recover_jid_status; + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ }; struct gfs2_sbd { @@ -544,6 +595,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct 
delayed_work sd_control_work; /* Inode Stuff */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 017960cf1d7..a7d611b93f0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -599,9 +599,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto fail_end_trans; - inc_nlink(&ip->i_inode); - if (S_ISDIR(ip->i_inode.i_mode)) - inc_nlink(&ip->i_inode); + set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index ce85b62bc0a..8944d1e32ab 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * Copyright 2004-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -11,12 +11,15 @@ #include <linux/dlm.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/delay.h> #include <linux/gfs2_ondisk.h> #include "incore.h" #include "glock.h" #include "util.h" +#include "sys.h" +extern struct workqueue_struct *gfs2_control_wq; static void gdlm_ast(void *arg) { @@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl) dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); } -static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +/* + * dlm/gfs2 recovery coordination using dlm_recover callbacks + * + * 1. dlm_controld sees lockspace members change + * 2. dlm_controld blocks dlm-kernel locking activity + * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep) + * 4. dlm_controld starts and finishes its own user level recovery + * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery + * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot) + * 7. dlm_recoverd does its own lock recovery + * 8. dlm_recoverd unblocks dlm-kernel locking activity + * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation) + * 10. gfs2_control updates control_lock lvb with new generation and jid bits + * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none) + * 12. gfs2_recover dequeues and recovers journals of failed nodes + * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result) + * 14. gfs2_control updates control_lock lvb jid bits for recovered journals + * 15. gfs2_control unblocks normal locking when all journals are recovered + * + * - failures during recovery + * + * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control + * clears BLOCK_LOCKS (step 15), e.g. another node fails while still + * recovering for a prior failure. gfs2_control needs a way to detect + * this so it can leave BLOCK_LOCKS set in step 15. This is managed using + * the recover_block and recover_start values. + * + * recover_done() provides a new lockspace generation number each time it + * is called (step 9). This generation number is saved as recover_start. + * When recover_prep() is called, it sets BLOCK_LOCKS and sets + * recover_block = recover_start. So, while recover_block is equal to + * recover_start, BLOCK_LOCKS should remain set. (recover_spin must + * be held around the BLOCK_LOCKS/recover_block/recover_start logic.) + * + * - more specific gfs2 steps in sequence above + * + * 3. 
recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start + * 6. recover_slot records any failed jids (maybe none) + * 9. recover_done sets recover_start = new generation number + * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids + * 12. gfs2_recover does journal recoveries for failed jids identified above + * 14. gfs2_control clears control_lock lvb bits for recovered jids + * 15. gfs2_control checks if recover_block == recover_start (step 3 occured + * again) then do nothing, otherwise if recover_start > recover_block + * then clear BLOCK_LOCKS. + * + * - parallel recovery steps across all nodes + * + * All nodes attempt to update the control_lock lvb with the new generation + * number and jid bits, but only the first to get the control_lock EX will + * do so; others will see that it's already done (lvb already contains new + * generation number.) + * + * . All nodes get the same recover_prep/recover_slot/recover_done callbacks + * . All nodes attempt to set control_lock lvb gen + bits for the new gen + * . One node gets control_lock first and writes the lvb, others see it's done + * . All nodes attempt to recover jids for which they see control_lock bits set + * . One node succeeds for a jid, and that one clears the jid bit in the lvb + * . All nodes will eventually see all lvb bits clear and unblock locks + * + * - is there a problem with clearing an lvb bit that should be set + * and missing a journal recovery? + * + * 1. jid fails + * 2. lvb bit set for step 1 + * 3. jid recovered for step 1 + * 4. jid taken again (new mount) + * 5. jid fails (for step 4) + * 6. lvb bit set for step 5 (will already be set) + * 7. lvb bit cleared for step 3 + * + * This is not a problem because the failure in step 5 does not + * require recovery, because the mount in step 4 could not have + * progressed far enough to unblock locks and access the fs. The + * control_mount() function waits for all recoveries to be complete + * for the latest lockspace generation before ever unblocking locks + * and returning. The mount in step 4 waits until the recovery in + * step 1 is done. + * + * - special case of first mounter: first node to mount the fs + * + * The first node to mount a gfs2 fs needs to check all the journals + * and recover any that need recovery before other nodes are allowed + * to mount the fs. (Others may begin mounting, but they must wait + * for the first mounter to be done before taking locks on the fs + * or accessing the fs.) This has two parts: + * + * 1. The mounted_lock tells a node it's the first to mount the fs. + * Each node holds the mounted_lock in PR while it's mounted. + * Each node tries to acquire the mounted_lock in EX when it mounts. + * If a node is granted the mounted_lock EX it means there are no + * other mounted nodes (no PR locks exist), and it is the first mounter. + * The mounted_lock is demoted to PR when first recovery is done, so + * others will fail to get an EX lock, but will get a PR lock. + * + * 2. The control_lock blocks others in control_mount() while the first + * mounter is doing first mount recovery of all journals. + * A mounting node needs to acquire control_lock in EX mode before + * it can proceed. The first mounter holds control_lock in EX while doing + * the first mount recovery, blocking mounts from other nodes, then demotes + * control_lock to NL when it's done (others_may_mount/first_done), + * allowing other nodes to continue mounting. 
+ * + * first mounter: + * control_lock EX/NOQUEUE success + * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters) + * set first=1 + * do first mounter recovery + * mounted_lock EX->PR + * control_lock EX->NL, write lvb generation + * + * other mounter: + * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry) + * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR) + * mounted_lock PR/NOQUEUE success + * read lvb generation + * control_lock EX->NL + * set first=0 + * + * - mount during recovery + * + * If a node mounts while others are doing recovery (not first mounter), + * the mounting node will get its initial recover_done() callback without + * having seen any previous failures/callbacks. + * + * It must wait for all recoveries preceding its mount to be finished + * before it unblocks locks. It does this by repeating the "other mounter" + * steps above until the lvb generation number is >= its mount generation + * number (from initial recover_done) and all lvb bits are clear. + * + * - control_lock lvb format + * + * 4 bytes generation number: the latest dlm lockspace generation number + * from recover_done callback. Indicates the jid bitmap has been updated + * to reflect all slot failures through that generation. + * 4 bytes unused. + * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates + * that jid N needs recovery. + */ + +#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */ + +static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); + memcpy(&gen, lvb_bits, sizeof(uint32_t)); + *lvb_gen = le32_to_cpu(gen); +} + +static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); + gen = cpu_to_le32(lvb_gen); + memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); +} + +static int all_jid_bits_clear(char *lvb) +{ + int i; + for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { + if (lvb[i]) + return 0; + } + return 1; +} + +static void sync_wait_cb(void *arg) +{ + struct lm_lockstruct *ls = arg; + complete(&ls->ls_sync_wait); +} + +static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - if (fsname == NULL) { - fs_info(sdp, "no fsname found\n"); - return -EINVAL; + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + if (error) { + fs_err(sdp, "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + fs_err(sdp, "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, + unsigned int num, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char strname[GDLM_STRNAME_BYTES]; + int error, status; + + memset(strname, 0, GDLM_STRNAME_BYTES); + snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); + + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + if (error) { + fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + status = lksb->sb_status; + + if (status && status != 
-EAGAIN) { + fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + +static int mounted_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK, + &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int control_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock"); +} + +static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK, + &ls->ls_control_lksb, "control_lock"); +} + +static void gfs2_control_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t block_gen, start_gen, lvb_gen, flags; + int recover_set = 0; + int write_lvb = 0; + int recover_size; + int i, error; + + spin_lock(&ls->ls_recover_spin); + /* + * No MOUNT_DONE means we're still mounting; control_mount() + * will set this flag, after which this thread will take over + * all further clearing of BLOCK_LOCKS. + * + * FIRST_MOUNT means this node is doing first mounter recovery, + * for which recovery control is handled by + * control_mount()/control_first_done(), not this thread. + */ + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + spin_unlock(&ls->ls_recover_spin); + + /* + * Equal block_gen and start_gen implies we are between + * recover_prep and recover_done callbacks, which means + * dlm recovery is in progress and dlm locking is blocked. + * There's no point trying to do any work until recover_done. + */ + + if (block_gen == start_gen) + return; + + /* + * Propagate recover_submit[] and recover_result[] to lvb: + * dlm_recoverd adds to recover_submit[] jids needing recovery + * gfs2_recover adds to recover_result[] journal recovery results + * + * set lvb bit for jids in recover_submit[] if the lvb has not + * yet been updated for the generation of the failure + * + * clear lvb bit for jids in recover_result[] if the result of + * the journal recovery is SUCCESS + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control lock EX error %d\n", error); + return; + } + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + spin_lock(&ls->ls_recover_spin); + if (block_gen != ls->ls_recover_block || + start_gen != ls->ls_recover_start) { + fs_info(sdp, "recover generation %u block1 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + return; + } + + recover_size = ls->ls_recover_size; + + if (lvb_gen <= start_gen) { + /* + * Clear lvb bits for jids we've successfully recovered. + * Because all nodes attempt to recover failed journals, + * a journal can be recovered multiple times successfully + * in succession. 
Only the first will really do recovery, + * the others find it clean, but still report a successful + * recovery. So, another node may have already recovered + * the jid and cleared the lvb bit for it. + */ + for (i = 0; i < recover_size; i++) { + if (ls->ls_recover_result[i] != LM_RD_SUCCESS) + continue; + + ls->ls_recover_result[i] = 0; + + if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) + continue; + + __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + write_lvb = 1; + } + } + + if (lvb_gen == start_gen) { + /* + * Failed slots before start_gen are already set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < lvb_gen) + ls->ls_recover_submit[i] = 0; + } + } else if (lvb_gen < start_gen) { + /* + * Failed slots before start_gen are not yet set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < start_gen) { + ls->ls_recover_submit[i] = 0; + __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + } + } + /* even if there are no bits to set, we need to write the + latest generation to the lvb */ + write_lvb = 1; + } else { + /* + * we should be getting a recover_done() for lvb_gen soon + */ + } + spin_unlock(&ls->ls_recover_spin); + + if (write_lvb) { + control_lvb_write(ls, start_gen, lvb_bits); + flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; + } else { + flags = DLM_LKF_CONVERT; + } + + error = control_lock(sdp, DLM_LOCK_NL, flags); + if (error) { + fs_err(sdp, "control lock NL error %d\n", error); + return; + } + + /* + * Everyone will see jid bits set in the lvb, run gfs2_recover_set(), + * and clear a jid bit in the lvb if the recovery is a success. + * Eventually all journals will be recovered, all jid bits will + * be cleared in the lvb, and everyone will clear BLOCK_LOCKS. 
+ */ + + for (i = 0; i < recover_size; i++) { + if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { + fs_info(sdp, "recover generation %u jid %d\n", + start_gen, i); + gfs2_recover_set(sdp, i); + recover_set++; + } + } + if (recover_set) + return; + + /* + * No more jid bits set in lvb, all recovery is done, unblock locks + * (unless a new recover_prep callback has occured blocking locks + * again while working above) + */ + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_block == block_gen && + ls->ls_recover_start == start_gen) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "recover generation %u done\n", start_gen); + gfs2_glock_thaw(sdp); + } else { + fs_info(sdp, "recover generation %u block2 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + } +} + +static int control_mount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen, mount_gen, lvb_gen; + int mounted_mode; + int retries = 0; + int error; + + memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE); + ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb; + init_completion(&ls->ls_sync_wait); + + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control_mount control_lock NL error %d\n", error); + return error; + } + + error = mounted_lock(sdp, DLM_LOCK_NL, 0); + if (error) { + fs_err(sdp, "control_mount mounted_lock NL error %d\n", error); + control_unlock(sdp); + return error; + } + mounted_mode = DLM_LOCK_NL; + +restart: + if (retries++ && signal_pending(current)) { + error = -EINTR; + goto fail; + } + + /* + * We always start with both locks in NL. control_lock is + * demoted to NL below so we don't need to do it here. + */ + + if (mounted_mode != DLM_LOCK_NL) { + error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + mounted_mode = DLM_LOCK_NL; + } + + /* + * Other nodes need to do some work in dlm recovery and gfs2_control + * before the recover_done and control_lock will be ready for us below. + * A delay here is not required but often avoids having to retry. + */ + + msleep_interruptible(500); + + /* + * Acquire control_lock in EX and mounted_lock in either EX or PR. + * control_lock lvb keeps track of any pending journal recoveries. + * mounted_lock indicates if any other nodes have the fs mounted. + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK); + if (error == -EAGAIN) { + goto restart; + } else if (error) { + fs_err(sdp, "control_mount control_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_EX; + goto locks_done; + } else if (error != -EAGAIN) { + fs_err(sdp, "control_mount mounted_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_PR; + goto locks_done; + } else { + /* not even -EAGAIN should happen here */ + fs_err(sdp, "control_mount mounted_lock PR error %d\n", error); + goto fail; + } + +locks_done: + /* + * If we got both locks above in EX, then we're the first mounter. 
+ * If not, then we need to wait for the control_lock lvb to be + * updated by other mounted nodes to reflect our mount generation. + * + * In simple first mounter cases, first mounter will see zero lvb_gen, + * but in cases where all existing nodes leave/fail before mounting + * nodes finish control_mount, then all nodes will be mounting and + * lvb_gen will be non-zero. + */ + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + if (lvb_gen == 0xFFFFFFFF) { + /* special value to force mount attempts to fail */ + fs_err(sdp, "control_mount control_lock disabled\n"); + error = -EINVAL; + goto fail; + } + + if (mounted_mode == DLM_LOCK_EX) { + /* first mounter, keep both EX while doing first recovery */ + spin_lock(&ls->ls_recover_spin); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "first mounter control generation %u\n", lvb_gen); + return 0; + } + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + + /* + * We are not first mounter, now we need to wait for the control_lock + * lvb generation to be >= the generation from our first recover_done + * and all lvb bits to be clear (no pending journal recoveries.) + */ + + if (!all_jid_bits_clear(lvb_bits)) { + /* journals need recovery, wait until all are clear */ + fs_info(sdp, "control_mount wait for journal recovery\n"); + goto restart; + } + + spin_lock(&ls->ls_recover_spin); + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + mount_gen = ls->ls_recover_mount; + + if (lvb_gen < mount_gen) { + /* wait for mounted nodes to update control_lock lvb to our + generation, which might include new recovery bits set */ + fs_info(sdp, "control_mount wait1 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (lvb_gen != start_gen) { + /* wait for mounted nodes to update control_lock lvb to the + latest recovery generation */ + fs_info(sdp, "control_mount wait2 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (block_gen == start_gen) { + /* dlm recovery in progress, wait for it to finish */ + fs_info(sdp, "control_mount wait3 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; } - error = dlm_new_lockspace(fsname, NULL, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL | - (ls->ls_nodir ? 
DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE, NULL, NULL, NULL, &ls->ls_dlm); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + return 0; + +fail: + mounted_unlock(sdp); + control_unlock(sdp); + return error; +} + +static int dlm_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +static int control_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen; + int error; + +restart: + spin_lock(&ls->ls_recover_spin); + start_gen = ls->ls_recover_start; + block_gen = ls->ls_recover_block; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) || + !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + /* sanity check, should not happen */ + fs_err(sdp, "control_first_done start %u block %u flags %lx\n", + start_gen, block_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + control_unlock(sdp); + return -1; + } + + if (start_gen == block_gen) { + /* + * Wait for the end of a dlm recovery cycle to switch from + * first mounter recovery. We can ignore any recover_slot + * callbacks between the recover_prep and next recover_done + * because we are still the first mounter and any failed nodes + * have not fully mounted, so they don't need recovery. + */ + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "control_first_done wait gen %u\n", start_gen); + + wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, + dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + + clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + + memset(lvb_bits, 0, sizeof(lvb_bits)); + control_lvb_write(ls, start_gen, lvb_bits); + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); + if (error) + fs_err(sdp, "control_first_done mounted PR error %d\n", error); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK); if (error) - printk(KERN_ERR "dlm_new_lockspace error %d", error); + fs_err(sdp, "control_first_done control NL error %d\n", error); return error; } +/* + * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) + * to accomodate the largest slot number. 
(NB dlm slot numbers start at 1, + * gfs2 jids start at 0, so jid = slot - 1) + */ + +#define RECOVER_SIZE_INC 16 + +static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, + int num_slots) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t *submit = NULL; + uint32_t *result = NULL; + uint32_t old_size, new_size; + int i, max_jid; + + max_jid = 0; + for (i = 0; i < num_slots; i++) { + if (max_jid < slots[i].slot - 1) + max_jid = slots[i].slot - 1; + } + + old_size = ls->ls_recover_size; + + if (old_size >= max_jid + 1) + return 0; + + new_size = old_size + RECOVER_SIZE_INC; + + submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + if (!submit || !result) { + kfree(submit); + kfree(result); + return -ENOMEM; + } + + spin_lock(&ls->ls_recover_spin); + memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t)); + memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t)); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = submit; + ls->ls_recover_result = result; + ls->ls_recover_size = new_size; + spin_unlock(&ls->ls_recover_spin); + return 0; +} + +static void free_recover_size(struct lm_lockstruct *ls) +{ + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_recover_size = 0; +} + +/* dlm calls before it does lock recovery */ + +static void gdlm_recover_prep(void *arg) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_block = ls->ls_recover_start; + set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_prep has been completed on all lockspace members; + identifies slot/jid of failed member */ + +static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int jid = slot->slot - 1; + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recover_slot jid %d gen %u short size %d", + jid, ls->ls_recover_block, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + if (ls->ls_recover_submit[jid]) { + fs_info(sdp, "recover_slot jid %d gen %u prev %u", + jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); + } + ls->ls_recover_submit[jid] = ls->ls_recover_block; + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_slot and after it completes lock recovery */ + +static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, + int our_slot, uint32_t generation) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + /* ensure the ls jid arrays are large enough */ + set_recover_size(sdp, slots, num_slots); + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_start = generation; + + if (!ls->ls_recover_mount) { + ls->ls_recover_mount = generation; + ls->ls_jid = our_slot - 1; + } + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); + + clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + smp_mb__after_clear_bit(); + 
wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); + spin_unlock(&ls->ls_recover_spin); +} + +/* gfs2_recover thread has a journal recovery result */ + +static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + /* don't care about the recovery of own journal during mount */ + if (jid == ls->ls_jid) + return; + + spin_lock(&ls->ls_recover_spin); + if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recovery_result jid %d short size %d", + jid, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + fs_info(sdp, "recover jid %d result %s\n", jid, + result == LM_RD_GAVEUP ? "busy" : "success"); + + ls->ls_recover_result[jid] = result; + + /* GAVEUP means another node is recovering the journal; delay our + next attempt to recover it, to give the other node a chance to + finish before trying again */ + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, + result == LM_RD_GAVEUP ? HZ : 0); + spin_unlock(&ls->ls_recover_spin); +} + +const struct dlm_lockspace_ops gdlm_lockspace_ops = { + .recover_prep = gdlm_recover_prep, + .recover_slot = gdlm_recover_slot, + .recover_done = gdlm_recover_done, +}; + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char cluster[GFS2_LOCKNAME_LEN]; + const char *fsname; + uint32_t flags; + int error, ops_result; + + /* + * initialize everything + */ + + INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + spin_lock_init(&ls->ls_recover_spin); + ls->ls_recover_flags = 0; + ls->ls_recover_mount = 0; + ls->ls_recover_start = 0; + ls->ls_recover_block = 0; + ls->ls_recover_size = 0; + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + + error = set_recover_size(sdp, NULL, 0); + if (error) + goto fail; + + /* + * prepare dlm_new_lockspace args + */ + + fsname = strchr(table, ':'); + if (!fsname) { + fs_info(sdp, "no fsname found\n"); + error = -EINVAL; + goto fail_free; + } + memset(cluster, 0, sizeof(cluster)); + memcpy(cluster, table, strlen(table) - strlen(fsname)); + fsname++; + + flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + if (ls->ls_nodir) + flags |= DLM_LSFL_NODIR; + + /* + * create/join lockspace + */ + + error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, + &gdlm_lockspace_ops, sdp, &ops_result, + &ls->ls_dlm); + if (error) { + fs_err(sdp, "dlm_new_lockspace error %d\n", error); + goto fail_free; + } + + if (ops_result < 0) { + /* + * dlm does not support ops callbacks, + * old dlm_controld/gfs_controld are used, try without ops. + */ + fs_info(sdp, "dlm lockspace ops not used\n"); + free_recover_size(ls); + set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags); + return 0; + } + + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) { + fs_err(sdp, "dlm lockspace ops disallow jid preset\n"); + error = -EINVAL; + goto fail_release; + } + + /* + * control_mount() uses control_lock to determine first mounter, + * and for later mounts, waits for any recoveries to be cleared. 
+ */ + + error = control_mount(sdp); + if (error) { + fs_err(sdp, "mount control error %d\n", error); + goto fail_release; + } + + ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + return 0; + +fail_release: + dlm_release_lockspace(ls->ls_dlm, 2); +fail_free: + free_recover_size(ls); +fail: + return error; +} + +static void gdlm_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + error = control_first_done(sdp); + if (error) + fs_err(sdp, "mount first_done error %d\n", error); +} + static void gdlm_unmount(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + goto release; + + /* wait for gfs2_control_wq to be done with this mount */ + + spin_lock(&ls->ls_recover_spin); + set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + flush_delayed_work_sync(&sdp->sd_control_work); + + /* mounted_lock and control_lock will be purged in dlm recovery */ +release: if (ls->ls_dlm) { dlm_release_lockspace(ls->ls_dlm, 2); ls->ls_dlm = NULL; } + + free_recover_size(ls); } static const match_table_t dlm_tokens = { @@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = { const struct lm_lockops gfs2_dlm_ops = { .lm_proto_name = "lock_dlm", .lm_mount = gdlm_mount, + .lm_first_done = gdlm_first_done, + .lm_recovery_result = gdlm_recovery_result, .lm_unmount = gdlm_unmount, .lm_put_lock = gdlm_put_lock, .lm_lock = gdlm_lock, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c150298e2d8..a8d9bcd0e19 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -28,6 +28,8 @@ #include "recovery.h" #include "dir.h" +struct workqueue_struct *gfs2_control_wq; + static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, .seeks = DEFAULT_SEEKS, @@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void) if (!gfs_recovery_wq) goto fail_wq; + gfs2_control_wq = alloc_workqueue("gfs2_control", + WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!gfs2_control_wq) + goto fail_control; + gfs2_register_debugfs(); printk("GFS2 installed\n"); return 0; +fail_control: + destroy_workqueue(gfs_recovery_wq); fail_wq: unregister_filesystem(&gfs2meta_fs_type); fail_unregister: @@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_control_wq); rcu_barrier(); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index fe72e79e6ff..6aacf3f230a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { char *message = "FIRSTMOUNT=Done"; char *envp[] = { message, NULL }; - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ls->ls_first_done = 1; + + fs_info(sdp, "first mount done, others may mount\n"); + + if (sdp->sd_lockstruct.ls_ops->lm_first_done) + sdp->sd_lockstruct.ls_ops->lm_first_done(sdp); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } @@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) struct gfs2_args *args = &sdp->sd_args; const char *proto = sdp->sd_proto_name; const char *table = sdp->sd_table_name; - const char *fsname; char *o, *options; int ret; @@ -1004,21 +1007,12 @@ hostdata_error: } 
} - if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); - else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, - sdp->sd_lockstruct.ls_jid); - - fsname = strchr(table, ':'); - if (fsname) - fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); return 0; } - ret = lm->lm_mount(sdp, fsname); + ret = lm->lm_mount(sdp, table); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); @@ -1084,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; - set_bit(SDF_NORECOVERY, &sdp->sd_flags); + set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; @@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail; + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + gfs2_create_debugfs_file(sdp); error = gfs2_sys_fs_add(sdp); @@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_sb; } + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + sdp->sd_table_name); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f2a02edcac8..963b2d75200 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, char env_status[20]; char *envp[] = { env_jid, env_status, NULL }; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; ls->ls_recover_jid_status = message; sprintf(env_jid, "JID=%d", jid); sprintf(env_status, "RECOVERY=%s", message == LM_RD_SUCCESS ? "Done" : "Failed"); kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); + + if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) + sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } void gfs2_recover_func(struct work_struct *work) @@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; - if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { @@ -577,6 +583,7 @@ fail_gunlock_j: fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: + jd->jd_recover_error = error; gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); @@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); - return 0; + return wait ? 
jd->jd_recover_error : 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 22234627f68..981bfa32121 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1108,9 +1108,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_blkreserv *rs = ip->i_res; - gfs2_blkrsv_put(ip); if (rs->rs_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + gfs2_blkrsv_put(ip); } /** diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 443cabcfcd2..d33172c291b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) ssize_t ret; int val = 0; - if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags)) val = 1; ret = sprintf(buf, "%d\n", val); return ret; @@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) val = simple_strtol(buf, NULL, 0); if (val == 1) - set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { - clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); smp_mb__after_clear_bit(); gfs2_glock_thaw(sdp); } else { @@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) goto out; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto out; - sdp->sd_lockstruct.ls_first = first; - rv = 0; + sdp->sd_lockstruct.ls_first = first; + rv = 0; out: spin_unlock(&sdp->sd_jindex_spin); return rv ? rv : len; @@ -360,19 +360,14 @@ out: static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%d\n", ls->ls_first_done); + return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags)); } -static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) { - unsigned jid; struct gfs2_jdesc *jd; int rv; - rv = sscanf(buf, "%u", &jid); - if (rv != 1) - return -EINVAL; - rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) @@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) } out: spin_unlock(&sdp->sd_jindex_spin); + return rv; +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + rv = gfs2_recover_set(sdp, jid); + return rv ? 
rv : len; } diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index e94560e836d..79182d6ad6a 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h @@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp); int gfs2_sys_init(void); void gfs2_sys_uninit(void); +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid); + #endif /* __SYS_DOT_H__ */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e425ad9d049..1e85a7ac021 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) } static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; diff --git a/fs/ioprio.c b/fs/ioprio.c index f79dab83e17..f84b380d65e 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -48,28 +48,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio) if (err) return err; - task_lock(task); - do { - ioc = task->io_context; - /* see wmb() in current_io_context() */ - smp_read_barrier_depends(); - if (ioc) - break; - - ioc = alloc_io_context(GFP_ATOMIC, -1); - if (!ioc) { - err = -ENOMEM; - break; - } - task->io_context = ioc; - } while (1); - - if (!err) { - ioc->ioprio = ioprio; - ioc->ioprio_changed = 1; + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc_ioprio_changed(ioc, ioprio); + put_io_context(ioc, NULL); } - task_unlock(task); return err; } EXPORT_SYMBOL_GPL(set_task_ioprio); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 23d7451b293..65ba36b80a9 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(nsm_lock); * Local NSM state */ u32 __read_mostly nsm_local_state; -int __read_mostly nsm_use_hostnames; +bool __read_mostly nsm_use_hostnames; static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) { diff --git a/fs/mpage.c b/fs/mpage.c index fdfae9fa98c..643e9f55ef2 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - struct blk_plug plug; - - blk_start_plug(&plug); map_bh.b_state = 0; map_bh.b_size = 0; @@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, BUG_ON(!list_empty(pages)); if (bio) mpage_bio_submit(READ, bio); - blk_finish_plug(&plug); return 0; } EXPORT_SYMBOL(mpage_readpages); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 277dfaf2e99..31778f74357 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -84,7 +84,7 @@ retry: /* * Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ -static int nfs4_disable_idmapping = 1; +static bool nfs4_disable_idmapping = true; /* * RPC cruft for NFS diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 25c3bfad795..f649fba8c38 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -57,7 +57,7 @@ #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 /* Default is to see 64-bit inode numbers */ -static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; +static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ee92538b06..8102db9b926 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int 
nfs_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, enum migrate_mode); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0c3885255f9..834f0fe96f8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1688,7 +1688,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page) + struct page *page, enum migrate_mode mode) { /* * If PagePrivate is set, then the page is currently associated with @@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 10e6366608f..8df1ea4a6ff 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -80,3 +80,13 @@ config NFSD_V4 available from http://linux-nfs.org/. If unsure, say N. + +config NFSD_FAULT_INJECTION + bool "NFS server manual fault injection" + depends on NFSD_V4 && DEBUG_KERNEL + help + This option enables support for manually injecting faults + into the NFS server. This is intended to be used for + testing error recovery on the NFS client. + + If unsure, say N. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 9b118ee2019..af32ef06b4f 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o +nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 62f3b9074e8..cf8a6bd062f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) struct svc_expkey key; struct svc_expkey *ek = NULL; - if (mesg[mlen-1] != '\n') + if (mlen < 1 || mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; @@ -1226,12 +1226,12 @@ nfsd_export_init(void) int rv; dprintk("nfsd: initializing export module.\n"); - rv = cache_register(&svc_export_cache); + rv = cache_register_net(&svc_export_cache, &init_net); if (rv) return rv; - rv = cache_register(&svc_expkey_cache); + rv = cache_register_net(&svc_expkey_cache, &init_net); if (rv) - cache_unregister(&svc_export_cache); + cache_unregister_net(&svc_export_cache, &init_net); return rv; } @@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void) dprintk("nfsd: shutting down export module.\n"); - cache_unregister(&svc_expkey_cache); - cache_unregister(&svc_export_cache); + cache_unregister_net(&svc_expkey_cache, &init_net); + cache_unregister_net(&svc_export_cache, &init_net); svcauth_unix_purge(); dprintk("nfsd: export shutdown complete.\n"); diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c new file mode 100644 index 00000000000..ce7f0758d84 --- /dev/null +++ b/fs/nfsd/fault_inject.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> + * + * Uses debugfs to create fault injection points for client testing + */ + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/module.h> + +#include "state.h" +#include "fault_inject.h" + +struct nfsd_fault_inject_op { + char *file; + void (*func)(u64); +}; + +static struct nfsd_fault_inject_op inject_ops[] 
= { + { + .file = "forget_clients", + .func = nfsd_forget_clients, + }, + { + .file = "forget_locks", + .func = nfsd_forget_locks, + }, + { + .file = "forget_openowners", + .func = nfsd_forget_openowners, + }, + { + .file = "forget_delegations", + .func = nfsd_forget_delegations, + }, + { + .file = "recall_delegations", + .func = nfsd_recall_delegations, + }, +}; + +static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); +static struct dentry *debug_dir; + +static int nfsd_inject_set(void *op_ptr, u64 val) +{ + struct nfsd_fault_inject_op *op = op_ptr; + + if (val == 0) + printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); + else + printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); + + op->func(val); + return 0; +} + +static int nfsd_inject_get(void *data, u64 *val) +{ + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n"); + +void nfsd_fault_inject_cleanup(void) +{ + debugfs_remove_recursive(debug_dir); +} + +int nfsd_fault_inject_init(void) +{ + unsigned int i; + struct nfsd_fault_inject_op *op; + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + + debug_dir = debugfs_create_dir("nfsd", NULL); + if (!debug_dir) + goto fail; + + for (i = 0; i < NUM_INJECT_OPS; i++) { + op = &inject_ops[i]; + if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) + goto fail; + } + return 0; + +fail: + nfsd_fault_inject_cleanup(); + return -ENOMEM; +} diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h new file mode 100644 index 00000000000..90bd0570956 --- /dev/null +++ b/fs/nfsd/fault_inject.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> + * + * Function definitions for fault injection + */ + +#ifndef LINUX_NFSD_FAULT_INJECT_H +#define LINUX_NFSD_FAULT_INJECT_H + +#ifdef CONFIG_NFSD_FAULT_INJECTION +int nfsd_fault_inject_init(void); +void nfsd_fault_inject_cleanup(void); +void nfsd_forget_clients(u64); +void nfsd_forget_locks(u64); +void nfsd_forget_openowners(u64); +void nfsd_forget_delegations(u64); +void nfsd_recall_delegations(u64); +#else /* CONFIG_NFSD_FAULT_INJECTION */ +static inline int nfsd_fault_inject_init(void) { return 0; } +static inline void nfsd_fault_inject_cleanup(void) {} +static inline void nfsd_forget_clients(u64 num) {} +static inline void nfsd_forget_locks(u64 num) {} +static inline void nfsd_forget_openowners(u64 num) {} +static inline void nfsd_forget_delegations(u64 num) {} +static inline void nfsd_recall_delegations(u64 num) {} +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + +#endif /* LINUX_NFSD_FAULT_INJECT_H */ diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 55780a22fdb..94096273cd6 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -36,6 +36,7 @@ #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> +#include <net/net_namespace.h> #include "idmap.h" #include "nfsd.h" @@ -466,20 +467,20 @@ nfsd_idmap_init(void) { int rv; - rv = cache_register(&idtoname_cache); + rv = cache_register_net(&idtoname_cache, &init_net); if (rv) return rv; - rv = cache_register(&nametoid_cache); + rv = cache_register_net(&nametoid_cache, &init_net); if (rv) - cache_unregister(&idtoname_cache); + cache_unregister_net(&idtoname_cache, &init_net); return rv; } void nfsd_idmap_shutdown(void) { - cache_unregister(&idtoname_cache); - cache_unregister(&nametoid_cache); + cache_unregister_net(&idtoname_cache, &init_net); + cache_unregister_net(&nametoid_cache, &init_net); } static int diff --git 
a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c5e28ed8bca..896da74ec56 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -266,10 +266,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ { __be32 status; - /* Only reclaims from previously confirmed clients are valid */ - if ((status = nfs4_check_open_reclaim(&open->op_clientid))) - return status; - /* We don't know the target directory, and therefore can not * set the change info */ @@ -373,6 +369,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + status = nfs4_check_open_reclaim(&open->op_clientid); + if (status) + goto out; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 80a0be9ed00..0b3e875d1ab 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -117,8 +117,7 @@ out_no_tfm: return status; } -int -nfsd4_create_clid_dir(struct nfs4_client *clp) +void nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char *dname = clp->cl_recdir; @@ -127,13 +126,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); - if (!rec_file || clp->cl_firststate) - return 0; - + if (clp->cl_firststate) + return; clp->cl_firststate = 1; + if (!rec_file) + return; status = nfs4_save_creds(&original_cred); if (status < 0) - return status; + return; dir = rec_file->f_path.dentry; /* lock the parent */ @@ -144,8 +144,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) status = PTR_ERR(dentry); goto out_unlock; } - status = -EEXIST; if (dentry->d_inode) + /* + * In the 4.1 case, where we're called from + * reclaim_complete(), records from the previous reboot + * may still be left, so this is OK. + * + * In the 4.0 case, we should never get here; but we may + * as well be forgiving and just succeed silently. 
+ */ goto out_put; status = mnt_want_write_file(rec_file); if (status) @@ -164,7 +171,6 @@ out_unlock: " and is writeable", status, user_recovery_dirname); nfs4_reset_creds(original_cred); - return status; } typedef int (recdir_func)(struct dentry *, struct dentry *); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9ca16dc09e0..e8c98f00967 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,12 +49,20 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static stateid_t zerostateid; /* bits all 0 */ -static stateid_t onestateid; /* bits all 1 */ + +#define all_ones {{~0,~0},~0} +static const stateid_t one_stateid = { + .si_generation = ~0, + .si_opaque = all_ones, +}; +static const stateid_t zero_stateid = { + /* all fields zero */ +}; + static u64 current_sessionid = 1; -#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) -#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) /* forward declarations */ static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); @@ -133,21 +141,21 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for open owners */ -#define OPEN_OWNER_HASH_BITS 8 -#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) -#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) +/* hash tables for lock and open owners */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) -static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); ret += clientid; - return ret & OPEN_OWNER_HASH_MASK; + return ret & OWNER_HASH_MASK; } -static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 @@ -514,6 +522,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) list_del(&lo->lo_owner.so_strhash); list_del(&lo->lo_perstateid); + list_del(&lo->lo_owner_ino_hash); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); @@ -985,12 +994,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); if (clp == NULL) return NULL; - clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); + clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); if (clp->cl_name.data == NULL) { kfree(clp); return NULL; } - memcpy(clp->cl_name.data, name.data, name.len); clp->cl_name.len = name.len; return clp; } @@ -1058,7 +1066,6 @@ expire_client(struct nfs4_client *clp) spin_unlock(&recall_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { @@ -2301,7 +2308,7 @@ nfsd4_free_slabs(void) nfsd4_free_slab(&deleg_slab); } -static int +int nfsd4_init_slabs(void) { openowner_slab = kmem_cache_create("nfsd4_openowners", @@ -2373,7 +2380,7 @@ static inline void 
*alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { - list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } @@ -2436,7 +2443,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) struct nfs4_stateowner *so; struct nfs4_openowner *oo; - list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (!so->so_is_open_owner) + continue; if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { oo = openowner(so); renew_client(oo->oo_owner.so_client); @@ -2580,7 +2589,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); + strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); oo = find_openstateowner_str(strhashval, open); open->op_openowner = oo; if (!oo) { @@ -3123,7 +3132,6 @@ nfs4_laundromat(void) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } test_val = nfsd4_lease; @@ -3718,13 +3726,11 @@ out: } -/* - * Lock owner state (byte-range locks) - */ #define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) -#define LOCK_HASH_BITS 8 -#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) -#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +#define LOCKOWNER_INO_HASH_BITS 8 +#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) +#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) static inline u64 end_offset(u64 start, u64 len) @@ -3746,16 +3752,14 @@ last_byte_offset(u64 start, u64 len) return end > start ? 
end - 1: NFS4_MAX_UINT64; } -static inline unsigned int -lock_ownerstr_hashval(struct inode *inode, u32 cl_id, - struct xdr_netobj *ownername) +static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) { return (file_hashval(inode) + cl_id + opaque_hashval(ownername->data, ownername->len)) - & LOCK_HASH_MASK; + & LOCKOWNER_INO_HASH_MASK; } -static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; +static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE]; /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that @@ -3809,23 +3813,39 @@ nevermind: deny->ld_type = NFS4_WRITE_LT; } +static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) +{ + struct nfs4_ol_stateid *lst; + + if (!same_owner_str(&lo->lo_owner, owner, clid)) + return false; + lst = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + return lst->st_file->fi_inode == inode; +} + static struct nfs4_lockowner * find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { - unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); - struct nfs4_stateowner *op; + unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); + struct nfs4_lockowner *lo; - list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(op, owner, clid)) - return lockowner(op); + list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { + if (same_lockowner_ino(lo, inode, clid, owner)) + return lo; } return NULL; } static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { - list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + struct inode *inode = open_stp->st_file->fi_inode; + unsigned int inohash = lockowner_ino_hashval(inode, + clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); + + list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -3834,7 +3854,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has * occurred. 
* - * strhashval = lock_ownerstr_hashval + * strhashval = ownerstr_hashval */ static struct nfs4_lockowner * @@ -3892,6 +3912,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) __set_bit(access, &lock_stp->st_access_bmap); } +__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) +{ + struct nfs4_file *fi = ost->st_file; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *cl = oo->oo_owner.so_client; + struct nfs4_lockowner *lo; + unsigned int strhashval; + + lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner); + if (lo) { + if (!cstate->minorversion) + return nfserr_bad_seqid; + /* XXX: a lockowner always has exactly one stateid: */ + *lst = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + return nfs_ok; + } + strhashval = ownerstr_hashval(cl->cl_clientid.cl_id, + &lock->v.new.owner); + lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); + if (lo == NULL) + return nfserr_jukebox; + *lst = alloc_init_lock_stateid(lo, fi, ost); + if (*lst == NULL) { + release_lockowner(lo); + return nfserr_jukebox; + } + *new = true; + return nfs_ok; +} + /* * LOCK operation */ @@ -3907,7 +3958,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; - unsigned int strhashval; + bool new_state = false; int lkflg; int err; @@ -3933,10 +3984,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * lock stateid. */ struct nfs4_ol_stateid *open_stp = NULL; - + + if (nfsd4_has_session(cstate)) + /* See rfc 5661 18.10.3: given clientid is ignored: */ + memcpy(&lock->v.new.clientid, + &cstate->session->se_client->cl_clientid, + sizeof(clientid_t)); + status = nfserr_stale_clientid; - if (!nfsd4_has_session(cstate) && - STALE_CLIENTID(&lock->lk_new_clientid)) + if (STALE_CLIENTID(&lock->lk_new_clientid)) goto out; /* validate and update open stateid and open seqid */ @@ -3948,25 +4004,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; - if (!nfsd4_has_session(cstate) && - !same_clid(&open_sop->oo_owner.so_client->cl_clientid, + if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->v.new.clientid)) goto out; - /* create lockowner and lock stateid */ - fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->oo_owner.so_client->cl_clientid.cl_id, - &lock->v.new.owner); - /* XXX: Do we need to check for duplicate stateowners on - * the same file, or should they just be allowed (and - * create new stateids)? 
*/ - status = nfserr_jukebox; - lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->oo_owner.so_client, open_stp, lock); - if (lock_sop == NULL) - goto out; - lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); - if (lock_stp == NULL) + status = lookup_or_create_lock_state(cstate, open_stp, lock, + &lock_stp, &new_state); + if (status) goto out; } else { /* lock (lock owner + lock stateid) already exists */ @@ -3976,10 +4019,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, NFS4_LOCK_STID, &lock_stp); if (status) goto out; - lock_sop = lockowner(lock_stp->st_stateowner); - fp = lock_stp->st_file; } - /* lock_sop and lock_stp have been created or found */ + lock_sop = lockowner(lock_stp->st_stateowner); + fp = lock_stp->st_file; lkflg = setlkflg(lock->lk_type); status = nfs4_check_openmode(lock_stp, lkflg); @@ -4054,7 +4096,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: - if (status && lock->lk_is_new && lock_sop) + if (status && new_state) release_lockowner(lock_sop); if (!cstate->replay_owner) nfs4_unlock_state(); @@ -4251,7 +4293,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; - int i; + unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", @@ -4266,22 +4308,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfserr_locks_held; - /* XXX: we're doing a linear search through all the lockowners. - * Yipes! For now we'll just hope clients aren't really using - * release_lockowner much, but eventually we have to fix these - * data structures. */ INIT_LIST_HEAD(&matches); - for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { - if (!same_owner_str(sop, owner, clid)) - continue; - list_for_each_entry(stp, &sop->so_stateids, - st_perstateowner) { - lo = lockowner(sop); - if (check_for_locks(stp->st_file, lo)) - goto out; - list_add(&lo->lo_list, &matches); - } + + list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) { + if (sop->so_is_open_owner) + continue; + if (!same_owner_str(sop, owner, clid)) + continue; + list_for_each_entry(stp, &sop->so_stateids, + st_perstateowner) { + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) + goto out; + list_add(&lo->lo_list, &matches); } } /* Clients probably won't expect us to return with some (but not all) @@ -4394,16 +4433,127 @@ nfs4_check_open_reclaim(clientid_t *clid) return nfs4_find_reclaim_client(clid) ? 
nfs_ok : nfserr_reclaim_bad; } +#ifdef CONFIG_NFSD_FAULT_INJECTION + +void nfsd_forget_clients(u64 num) +{ + struct nfs4_client *clp, *next; + int count = 0; + + nfs4_lock_state(); + list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { + nfsd4_remove_clid_dir(clp); + expire_client(clp); + if (++count == num) + break; + } + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d clients", count); +} + +static void release_lockowner_sop(struct nfs4_stateowner *sop) +{ + release_lockowner(lockowner(sop)); +} + +static void release_openowner_sop(struct nfs4_stateowner *sop) +{ + release_openowner(openowner(sop)); +} + +static int nfsd_release_n_owners(u64 num, bool is_open_owner, + void (*release_sop)(struct nfs4_stateowner *)) +{ + int i, count = 0; + struct nfs4_stateowner *sop, *next; + + for (i = 0; i < OWNER_HASH_SIZE; i++) { + list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) { + if (sop->so_is_open_owner != is_open_owner) + continue; + release_sop(sop); + if (++count == num) + return count; + } + } + return count; +} + +void nfsd_forget_locks(u64 num) +{ + int count; + + nfs4_lock_state(); + count = nfsd_release_n_owners(num, false, release_lockowner_sop); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d locks", count); +} + +void nfsd_forget_openowners(u64 num) +{ + int count; + + nfs4_lock_state(); + count = nfsd_release_n_owners(num, true, release_openowner_sop); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d open owners", count); +} + +int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *)) +{ + int i, count = 0; + struct nfs4_file *fp, *fnext; + struct nfs4_delegation *dp, *dnext; + + for (i = 0; i < FILE_HASH_SIZE; i++) { + list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { + list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { + deleg_func(dp); + if (++count == num) + return count; + } + } + } + + return count; +} + +void nfsd_forget_delegations(u64 num) +{ + unsigned int count; + + nfs4_lock_state(); + count = nfsd_process_n_delegations(num, unhash_delegation); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d delegations", count); +} + +void nfsd_recall_delegations(u64 num) +{ + unsigned int count; + + nfs4_lock_state(); + spin_lock(&recall_lock); + count = nfsd_process_n_delegations(num, nfsd_break_one_deleg); + spin_unlock(&recall_lock); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Recalled %d delegations", count); +} + +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + /* initialization to perform at module load time: */ -int +void nfs4_state_init(void) { - int i, status; + int i; - status = nfsd4_init_slabs(); - if (status) - return status; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&conf_str_hashtbl[i]); @@ -4416,18 +4566,15 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); - } - for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); + for (i = 0; i < OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&ownerstr_hashtbl[i]); } - memset(&onestateid, ~0, sizeof(stateid_t)); + for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) + INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]); INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); reclaim_str_hashtbl_size = 0; - return 0; } static void @@ -4526,7 +4673,6 @@ 
__nfs4_state_shutdown(void) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b6fa792d6b8..0ec5a1b9700 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -215,10 +215,9 @@ defer_free(struct nfsd4_compoundargs *argp, static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { if (p == argp->tmp) { - p = kmalloc(nbytes, GFP_KERNEL); + p = kmemdup(argp->tmp, nbytes, GFP_KERNEL); if (!p) return NULL; - memcpy(p, argp->tmp, nbytes); } else { BUG_ON(p != argp->tmpp); argp->tmpp = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bb4a11d58a5..748eda93ce5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -18,6 +18,7 @@ #include "idmap.h" #include "nfsd.h" #include "cache.h" +#include "fault_inject.h" /* * We have a single directory with several nodes in it. @@ -1128,9 +1129,13 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = nfs4_state_init(); /* nfs4 locking state */ + retval = nfsd4_init_slabs(); if (retval) return retval; + nfs4_state_init(); + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_free_slabs; nfsd_stat_init(); /* Statistics */ retval = nfsd_reply_cache_init(); if (retval) @@ -1161,6 +1166,8 @@ out_free_cache: nfsd_reply_cache_shutdown(); out_free_stat: nfsd_stat_shutdown(); + nfsd_fault_inject_cleanup(); +out_free_slabs: nfsd4_free_slabs(); return retval; } @@ -1175,6 +1182,7 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd_idmap_shutdown(); nfsd4_free_slabs(); + nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 58134a23fdf..1d1e8589b4c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq) */ #ifdef CONFIG_NFSD_V4 extern unsigned int max_delegations; -int nfs4_state_init(void); +void nfs4_state_init(void); +int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); void nfs4_state_shutdown(void); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); #else -static inline int nfs4_state_init(void) { return 0; } +static inline void nfs4_state_init(void) { } +static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } static inline void nfs4_state_shutdown(void) { } @@ -338,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) } /* These will return ERR_INVAL if specified in GETATTR or READDIR. */ -#define NFSD_WRITEONLY_ATTRS_WORD1 \ -(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEONLY_ATTRS_WORD1 \ + (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) /* These are the only attrs allowed in CREATE/OPEN/SETATTR. 
*/ -#define NFSD_WRITEABLE_ATTRS_WORD0 \ -(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) -#define NFSD_WRITEABLE_ATTRS_WORD1 \ -(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD0 \ + (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ + (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #define NFSD_WRITEABLE_ATTRS_WORD2 0 #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index a3cf38476a1..ffb5df1db94 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -366,6 +366,7 @@ struct nfs4_openowner { struct nfs4_lockowner { struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_owner_ino_hash; /* hash by owner,file */ struct list_head lo_perstateid; /* for lockowners only */ struct list_head lo_list; /* for temporary uses */ }; @@ -482,7 +483,7 @@ extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); extern void nfsd4_recdir_purge_old(void); -extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d25a723b68a..edf6d3ed877 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } -#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." -#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +/* + * NFS junction information is stored in an extended attribute. + */ +#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs" + +/** + * nfsd4_is_junction - Test if an object could be an NFS junction + * + * @dentry: object to test + * + * Returns 1 if "dentry" appears to contain NFS junction information. + * Otherwise 0 is returned. + */ int nfsd4_is_junction(struct dentry *dentry) { struct inode *inode = dentry->d_inode; @@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) return 0; return 1; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index e14587d5568..f104d565b68 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; - /* 1 from caller and 1 for being on i_list/g_list */ - BUG_ON(atomic_read(&mark->refcnt) < 2); - spin_lock(&group->mark_lock); if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { @@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) iput(inode); /* + * We don't necessarily have a ref on mark from caller so the above iput + * may have already destroyed it. Don't touch from now on. + */ + + /* * it's possible that this group tried to destroy itself, but this * this mark was simultaneously being freed by inode. If that's the * case, we finish freeing the group here. 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 608be451609..5a4a8af5c40 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG -module_param(debug_msgs, bool, 0); +module_param(debug_msgs, bint, 0); MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); #endif diff --git a/fs/pipe.c b/fs/pipe.c index f0e485d54e6..a932ced92a1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; diff --git a/fs/proc/array.c b/fs/proc/array.c index 8c344f037bd..c602b8d20f0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ); + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); @@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", pid_nr_ns(pid, ns), tcomm, state, @@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime), + (mm && permitted) ? mm->start_data : 0, + (mm && permitted) ? mm->end_data : 0, + (mm && permitted) ? mm->start_brk : 0); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb..5485a5388ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask) bool has_perms; task = get_proc_task(inode); + if (!task) + return -ESRCH; has_perms = has_pid_permissions(pid, task, 1); put_task_struct(task); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f744be98cd5..af0b7380259 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, spin_lock(&cache->lock); while (1) { - for (i = 0; i < cache->entries; i++) - if (cache->entry[i].block == block) + for (i = cache->curr_blk, n = 0; n < cache->entries; n++) { + if (cache->entry[i].block == block) { + cache->curr_blk = i; break; + } + i = (i + 1) % cache->entries; + } - if (i == cache->entries) { + if (n == cache->entries) { /* * Block not in cache, if all cache entries are used * go to sleep waiting for one to become available. 
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
 		goto cleanup;
 	}
+	cache->curr_blk = 0;
 	cache->next_blk = 0;
 	cache->unused = entries;
 	cache->entries = entries;
@@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
 		u64 *block, int *offset, int length)
 {
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
-	int bytes, copied = length;
+	int bytes, res = length;
 	struct squashfs_cache_entry *entry;
 	TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
 	while (length) {
 		entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
-		if (entry->error)
-			return entry->error;
-		else if (*offset >= entry->length)
-			return -EIO;
+		if (entry->error) {
+			res = entry->error;
+			goto error;
+		} else if (*offset >= entry->length) {
+			res = -EIO;
+			goto error;
+		}
 		bytes = squashfs_copy_data(buffer, entry, *offset, length);
 		if (buffer)
@@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
 		squashfs_cache_put(entry);
 	}
-	return copied;
+	return res;
+
+error:
+	squashfs_cache_put(entry);
+	return res;
 }
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index fd7b3b3bda1..81afbccfa84 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		inode->i_op = &squashfs_inode_ops;
 		inode->i_fop = &generic_ro_fops;
 		inode->i_mode |= S_IFREG;
-		inode->i_blocks = ((inode->i_size -
-				le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
+		inode->i_blocks = (inode->i_size -
+				le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
 		squashfs_i(inode)->fragment_block = frag_blk;
 		squashfs_i(inode)->fragment_size = frag_size;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 651f0b31d29..52934a22f29 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -28,6 +28,7 @@ struct squashfs_cache {
 	char	*name;
 	int	entries;
+	int	curr_blk;
 	int	next_blk;
 	int	num_waiters;
 	int	unused;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d0858c2d9a4..ecaa2f7bdb8 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -290,7 +290,7 @@ handle_fragments:
 check_directory_table:
 	/* Sanity check directory_table */
-	if (msblk->directory_table >= next_table) {
+	if (msblk->directory_table > next_table) {
 		err = -EINVAL;
 		goto failed_mount;
 	}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b09ba2dd8b6..f922cbacdb9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -38,9 +38,6 @@ DEFINE_SPINLOCK(dbg_lock);
-static char dbg_key_buf0[128];
-static char dbg_key_buf1[128];
-
 static const char *get_key_fmt(int fmt)
 {
 	switch (fmt) {
@@ -103,8 +100,8 @@ static const char *get_dent_type(int type)
 	}
 }
-static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
-			char *buffer)
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+			     const union ubifs_key *key, char *buffer, int len)
 {
 	char *p = buffer;
 	int type = key_type(c, key);
@@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
 	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
 		switch (type) {
 		case UBIFS_INO_KEY:
-			sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key),
-				get_key_type(type));
+			len -= snprintf(p, len, "(%lu, %s)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type));
 			break;
 		case UBIFS_DENT_KEY:
 		case UBIFS_XENT_KEY:
-			sprintf(p, "(%lu, %s, %#08x)",
-				(unsigned long)key_inum(c, key),
-				get_key_type(type), key_hash(c, key));
+			len -= snprintf(p, len, "(%lu, %s, %#08x)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type), key_hash(c, key));
 			break;
 		case UBIFS_DATA_KEY:
-			sprintf(p, "(%lu, %s, %u)",
-				(unsigned long)key_inum(c, key),
-				get_key_type(type), key_block(c, key));
+			len -= snprintf(p, len, "(%lu, %s, %u)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type), key_block(c, key));
 			break;
 		case UBIFS_TRUN_KEY:
-			sprintf(p, "(%lu, %s)",
-				(unsigned long)key_inum(c, key),
-				get_key_type(type));
+			len -= snprintf(p, len, "(%lu, %s)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type));
 			break;
 		default:
-			sprintf(p, "(bad key type: %#08x, %#08x)",
-				key->u32[0], key->u32[1]);
+			len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
+					key->u32[0], key->u32[1]);
 		}
 	} else
-		sprintf(p, "bad key format %d", c->key_fmt);
-}
-
-const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
-{
-	/* dbg_lock must be held */
-	sprintf_key(c, key, dbg_key_buf0);
-	return dbg_key_buf0;
-}
-
-const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
-{
-	/* dbg_lock must be held */
-	sprintf_key(c, key, dbg_key_buf1);
-	return dbg_key_buf1;
+		len -= snprintf(p, len, "bad key format %d", c->key_fmt);
+	ubifs_assert(len > 0);
+	return p;
 }
 const char *dbg_ntype(int type)
@@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 	int i, n;
 	union ubifs_key key;
 	const struct ubifs_ch *ch = node;
+	char key_buf[DBG_KEY_BUF_LEN];
 	if (dbg_is_tst_rcvry(c))
 		return;
@@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		const struct ubifs_ino_node *ino = node;
 		key_read(c, &ino->key, &key);
-		printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+		printk(KERN_DEBUG "\tkey %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
 		printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
 		       (unsigned long long)le64_to_cpu(ino->creat_sqnum));
 		printk(KERN_DEBUG "\tsize %llu\n",
@@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		int nlen = le16_to_cpu(dent->nlen);
 		key_read(c, &dent->key, &key);
-		printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+		printk(KERN_DEBUG "\tkey %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
 		printk(KERN_DEBUG "\tinum %llu\n",
 		       (unsigned long long)le64_to_cpu(dent->inum));
 		printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
@@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
 		key_read(c, &dn->key, &key);
-		printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+		printk(KERN_DEBUG "\tkey %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
 		printk(KERN_DEBUG "\tsize %u\n", le32_to_cpu(dn->size));
 		printk(KERN_DEBUG "\tcompr_typ %d\n",
@@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 			key_read(c, &br->key, &key);
 			printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
 			       i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
-			       le32_to_cpu(br->len), DBGKEY(&key));
+			       le32_to_cpu(br->len),
+			       dbg_snprintf_key(c, &key, key_buf,
+						DBG_KEY_BUF_LEN));
 		}
 		break;
 	}
@@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
 {
 	int n;
 	const struct ubifs_zbranch *zbr;
+	char key_buf[DBG_KEY_BUF_LEN];
 	spin_lock(&dbg_lock);
 	if (znode->parent)
@@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c,
 			printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
 			       "%s\n", n, zbr->znode, zbr->lnum,
 			       zbr->offs, zbr->len,
-			       DBGKEY(&zbr->key));
+			       dbg_snprintf_key(c, &zbr->key,
+						key_buf,
+						DBG_KEY_BUF_LEN));
 		else
 			printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
 			       "%s\n", n, zbr->znode, zbr->lnum, zbr->offs,
 			       zbr->len,
-			       DBGKEY(&zbr->key));
+			       dbg_snprintf_key(c, &zbr->key,
+						key_buf,
+						DBG_KEY_BUF_LEN));
 	}
 	spin_unlock(&dbg_lock);
 }
@@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 	int err, nlen1, nlen2, cmp;
 	struct ubifs_dent_node *dent1, *dent2;
 	union ubifs_key key;
+	char key_buf[DBG_KEY_BUF_LEN];
 	ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
 	dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 	key_read(c, &dent1->key, &key);
 	if (keys_cmp(c, &zbr1->key, &key)) {
 		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
-			zbr1->offs, DBGKEY(&key));
+			zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+						     DBG_KEY_BUF_LEN));
 		dbg_err("but it should have key %s according to tnc",
-			DBGKEY(&zbr1->key));
+			dbg_snprintf_key(c, &zbr1->key, key_buf,
+					 DBG_KEY_BUF_LEN));
 		dbg_dump_node(c, dent1);
 		goto out_free;
 	}
@@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 	key_read(c, &dent2->key, &key);
 	if (keys_cmp(c, &zbr2->key, &key)) {
 		dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
-			zbr1->offs, DBGKEY(&key));
+			zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+						     DBG_KEY_BUF_LEN));
 		dbg_err("but it should have key %s according to tnc",
-			DBGKEY(&zbr2->key));
+			dbg_snprintf_key(c, &zbr2->key, key_buf,
+					 DBG_KEY_BUF_LEN));
 		dbg_dump_node(c, dent2);
 		goto out_free;
 	}
@@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 		dbg_err("2 xent/dent nodes with the same name");
 	else
 		dbg_err("bad order of colliding key %s",
-			DBGKEY(&key));
+			dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
 	ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
 	dbg_dump_node(c, dent1);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8d9c4681018..ad1a6fee601 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -169,40 +169,39 @@ struct ubifs_global_debug_info {
 		spin_unlock(&dbg_lock);                                   \
 } while (0)
-const char *dbg_key_str0(const struct ubifs_info *c,
-			 const union ubifs_key *key);
-const char *dbg_key_str1(const struct ubifs_info *c,
-			 const union ubifs_key *key);
-
-/*
- * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
- * macros.
- */
-#define DBGKEY(key) dbg_key_str0(c, (key))
-#define DBGKEY1(key) dbg_key_str1(c, (key))
-
-extern spinlock_t dbg_lock;
-
-#define ubifs_dbg_msg(type, fmt, ...) do {                                \
-	spin_lock(&dbg_lock);                                             \
-	pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__);         \
-	spin_unlock(&dbg_lock);                                           \
+#define ubifs_dbg_msg(type, fmt, ...) \
+	pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
+
+#define DBG_KEY_BUF_LEN 32
+#define ubifs_dbg_msg_key(type, key, fmt, ...) do {                       \
+	char __tmp_key_buf[DBG_KEY_BUF_LEN];                              \
+	pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__,        \
+		 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
 } while (0)
 /* Just a debugging messages not related to any specific UBIFS subsystem */
-#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) \
+	printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
+	       __func__, ##__VA_ARGS__)
+
 /* General messages */
 #define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
 /* Additional journal messages */
 #define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) \
+	ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
 /* Additional TNC messages */
 #define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) \
+	ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
 /* Additional lprops messages */
 #define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
 /* Additional LEB find messages */
 #define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
 /* Additional mount messages */
 #define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) \
+	ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
 /* Additional I/O messages */
 #define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
 /* Additional commit messages */
@@ -218,6 +217,7 @@ extern spinlock_t dbg_lock;
 /* Additional recovery messages */
 #define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
+extern spinlock_t dbg_lock;
 extern struct ubifs_global_debug_info ubifs_dbg;
 static inline int dbg_is_chk_gen(const struct ubifs_info *c)
@@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state);
 const char *dbg_jhead(int jhead);
 const char *dbg_get_key_dump(const struct ubifs_info *c,
 			     const union ubifs_key *key);
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+			     const union ubifs_key *key, char *buffer, int len);
 void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
 void dbg_dump_node(const struct ubifs_info *c, const void *node);
 void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
@@ -345,20 +347,23 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 #define dbg_dump_stack()
 #define ubifs_assert_cmt_locked(c)
-#define dbg_msg(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gen(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...)        ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...)        ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)        ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)        ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)         ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)       ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)        ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)         ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)        ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)       ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)        ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)         ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)       ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...)      ubifs_dbg_msg(fmt, ##__VA_ARGS__)
 static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
 static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
@@ -368,6 +373,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
 static inline const char *
 dbg_get_key_dump(const struct ubifs_info *c,
 		 const union ubifs_key *key) { return ""; }
+static inline const char *
+dbg_snprintf_key(const struct ubifs_info *c,
+		 const union ubifs_key *key, char *buffer,
+		 int len) { return ""; }
 static inline void dbg_dump_inode(struct ubifs_info *c,
 				  const struct inode *inode) { return; }
 static inline void dbg_dump_node(const struct ubifs_info *c,
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index cef0460f4c5..2f438ab2e7a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 	int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
 	struct ubifs_inode *ui = ubifs_inode(inode);
-	dbg_jnl("ino %lu, blk %u, len %d, key %s",
-		(unsigned long)key_inum(c, key), key_block(c, key), len,
-		DBGKEY(key));
+	dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
+		 (unsigned long)key_inum(c, key), key_block(c, key), len);
 	ubifs_assert(len <= UBIFS_BLOCK_SIZE);
 	data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 		dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
 		blk = new_size >> UBIFS_BLOCK_SHIFT;
 		data_key_init(c, &key, inum, blk);
-		dbg_jnl("last block key %s", DBGKEY(&key));
+		dbg_jnlk(&key, "last block key ");
 		err = ubifs_tnc_lookup(c, &key, dn);
 		if (err == -ENOENT)
 			dlen = 0; /* Not found (so it is a hole) */
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ccabaf1164b..b007637f040 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
 {
 	int err;
-	dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
-		r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
+	dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
+		 r->lnum, r->offs, r->len, r->deletion, r->sqnum);
 	/* Set c->replay_sqnum to help deal with dangling branches. */
 	c->replay_sqnum = r->sqnum;
@@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
 {
 	struct replay_entry *r;
-	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+	dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
 	if (key_inum(c, key) >= c->highest_inum)
 		c->highest_inum = key_inum(c, key);
@@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
 	struct replay_entry *r;
 	char *nbuf;
-	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+	dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
 	if (key_inum(c, key) >= c->highest_inum)
 		c->highest_inum = key_inum(c, key);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e14ee53159d..16ad84d8402 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -505,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
 {
 	int ret;
-	dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
+	dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
 	ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
 			    zbr->offs);
@@ -519,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
 			ret = 0;
 	}
 	if (ret == 0 && c->replaying)
-		dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
-			zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
+		dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
+			 zbr->lnum, zbr->offs, zbr->len);
 	return ret;
 }
@@ -995,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c,
 	if (adding || !o_znode)
 		return 0;
-	dbg_mnt("dangling match LEB %d:%d len %d %s",
+	dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
 		o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
-		o_znode->zbranch[o_n].len, DBGKEY(key));
+		o_znode->zbranch[o_n].len);
 	*zn = o_znode;
 	*n = o_n;
 	return 1;
@@ -1179,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
 	struct ubifs_znode *znode;
 	unsigned long time = get_seconds();
-	dbg_tnc("search key %s", DBGKEY(key));
+	dbg_tnck(key, "search key ");
 	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
 	znode = c->zroot.znode;
@@ -1315,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
 	struct ubifs_znode *znode;
 	unsigned long time = get_seconds();
-	dbg_tnc("search and dirty key %s", DBGKEY(key));
+	dbg_tnck(key, "search and dirty key ");
 	znode = c->zroot.znode;
 	if (unlikely(!znode)) {
@@ -1722,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
 	if (!keys_eq(c, &zbr->key, &key1)) {
 		ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs);
-		dbg_tnc("looked for key %s found node's key %s",
-			DBGKEY(&zbr->key), DBGKEY1(&key1));
+		dbg_tnck(&zbr->key, "looked for key ");
+		dbg_tnck(&key1, "found node's key ");
 		goto out_err;
 	}
@@ -1776,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
 		ubifs_err("failed to read from LEB %d:%d, error %d",
 			  lnum, offs, err);
 		dbg_dump_stack();
-		dbg_tnc("key %s", DBGKEY(&bu->key));
+		dbg_tnck(&bu->key, "key ");
 		return err;
 	}
@@ -1811,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 	int found, n, err;
 	struct ubifs_znode *znode;
-	dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
+	dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
 	mutex_lock(&c->tnc_mutex);
 	found = ubifs_lookup_level0(c, key, &znode, &n);
 	if (!found) {
@@ -1985,8 +1985,7 @@ again:
 	zp = znode->parent;
 	if (znode->child_cnt < c->fanout) {
 		ubifs_assert(n != c->fanout);
-		dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
-			DBGKEY(key));
+		dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
 		insert_zbranch(znode, zbr, n);
@@ -2001,7 +2000,7 @@ again:
 	 * Unfortunately, @znode does not have more empty slots and we have to
 	 * split it.
 	 */
-	dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
+	dbg_tnck(key, "splitting level %d, key ", znode->level);
 	if (znode->alt)
 		/*
@@ -2095,7 +2094,7 @@ do_split:
 	}
 	/* Insert new key and branch */
-	dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
+	dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
 	insert_zbranch(zi, zbr, n);
@@ -2171,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
 	struct ubifs_znode *znode;
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
+	dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (!found) {
 		struct ubifs_zbranch zbr;
@@ -2220,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
 	struct ubifs_znode *znode;
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
-		old_offs, lnum, offs, len, DBGKEY(key));
+	dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
+		 old_offs, lnum, offs, len);
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (found < 0) {
 		err = found;
@@ -2303,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
 	struct ubifs_znode *znode;
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
-		DBGKEY(key));
+	dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+		 lnum, offs, nm->len, nm->name);
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (found < 0) {
 		err = found;
@@ -2397,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
 	/* Delete without merge for now */
 	ubifs_assert(znode->level == 0);
 	ubifs_assert(n >= 0 && n < c->fanout);
-	dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
+	dbg_tnck(&znode->zbranch[n].key, "deleting key ");
 	zbr = &znode->zbranch[n];
 	lnc_free(zbr);
@@ -2507,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
 	struct ubifs_znode *znode;
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("key %s", DBGKEY(key));
+	dbg_tnck(key, "key ");
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (found < 0) {
 		err = found;
@@ -2538,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
 	struct ubifs_znode *znode;
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
+	dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
 	err = lookup_level0_dirty(c, key, &znode, &n);
 	if (err < 0)
 		goto out_unlock;
@@ -2653,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
 				dbg_dump_znode(c, znode);
 				goto out_unlock;
 			}
-			dbg_tnc("removing %s", DBGKEY(key));
+			dbg_tnck(key, "removing key ");
 		}
 		if (k) {
 			for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2773,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
 	struct ubifs_zbranch *zbr;
 	union ubifs_key *dkey;
-	dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
+	dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
 	ubifs_assert(is_hash_key(c, key));
 	mutex_lock(&c->tnc_mutex);
@@ -3332,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
 out_dump:
 	block = key_block(c, key);
-	ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
-		  "(data key %s)", (unsigned long)inode->i_ino, size,
-		  ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+	ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
+		  (unsigned long)inode->i_ino, size,
+		  ((loff_t)block) << UBIFS_BLOCK_SHIFT);
 	mutex_unlock(&c->tnc_mutex);
 	dbg_dump_inode(c, inode);
 	dbg_dump_stack();
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index b48db999903..dc28fe6ec07 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
 		case UBIFS_XENT_KEY:
 			break;
 		default:
-			dbg_msg("bad key type at slot %d: %s", i,
-				DBGKEY(&zbr->key));
+			dbg_msg("bad key type at slot %d: %d",
+				i, key_type(c, &zbr->key));
 			err = 3;
 			goto out_dump;
 		}
@@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 			       zbr->offs);
 	if (err) {
-		dbg_tnc("key %s", DBGKEY(key));
+		dbg_tnck(key, "key ");
 		return err;
 	}
@@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 	if (!keys_eq(c, key, &key1)) {
 		ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs);
-		dbg_tnc("looked for key %s found node's key %s",
-			DBGKEY(key), DBGKEY1(&key1));
+		dbg_tnck(key, "looked for key ");
+		dbg_tnck(&key1, "but found node's key ");
 		dbg_dump_node(c, node);
 		return -EINVAL;
 	}
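A note on the squashfs_read_inode() hunk above: the new i_blocks expression simply rounds the number of block-backed bytes up to whole 512-byte units. The short standalone program below is an illustrative sketch only; it is not part of the patch, the helper names are made up, and it models both operands as unsigned 64-bit values, whereas the kernel expression mixes a signed loff_t with an unsigned u64.

#include <stdio.h>

/* Old squashfs form: ((bytes - 1) >> 9) + 1, with bytes = i_size - sparse. */
static unsigned long long blocks_old(unsigned long long bytes)
{
	return ((bytes - 1) >> 9) + 1;
}

/* New squashfs form: (bytes + 511) >> 9, i.e. round up to the next 512-byte block. */
static unsigned long long blocks_new(unsigned long long bytes)
{
	return (bytes + 511) >> 9;
}

int main(void)
{
	/* A few sample byte counts, including the fully sparse case (0). */
	unsigned long long samples[] = { 0, 1, 511, 512, 513, 1024, 1536 };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("bytes=%-5llu old=%llu new=%llu\n",
		       samples[i], blocks_old(samples[i]), blocks_new(samples[i]));
	return 0;
}

Under these assumptions both forms compute the ceiling of bytes/512 for every non-zero byte count; they differ only when no bytes are backed by blocks, where the old form wraps around instead of yielding zero.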