Diffstat (limited to 'fs')
146 files changed, 8370 insertions, 4067 deletions
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 54f92379272..475f9c597cb 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -61,8 +61,6 @@ do { \ current->pid, __func__, ##args); \ } while (0) -extern spinlock_t autofs4_lock; - /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 1442da4860e..509fe1eb66a 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, return -EBUSY; } else { struct file *pipe = fget(pipefd); + if (!pipe) { + err = -EBADF; + goto out; + } if (!pipe->f_op || !pipe->f_op->write) { err = -EPIPE; fput(pipe); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index f43100b9662..450f529a4ea 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -87,18 +87,70 @@ done: } /* + * Calculate and dget next entry in the subdirs list under root. + */ +static struct dentry *get_next_positive_subdir(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *q; + + spin_lock(&sbi->lookup_lock); + + if (prev == NULL) { + spin_lock(&root->d_lock); + prev = dget_dlock(root); + next = prev->d_subdirs.next; + p = prev; + goto start; + } + + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_u.d_child.next; +start: + if (next == &root->d_subdirs) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + q = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(q)) { + spin_unlock(&p->d_lock); + p = q; + goto again; + } + dget_dlock(q); + spin_unlock(&q->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return q; +} + +/* * Calculate and dget next entry in top down tree traversal. 
*/ static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; struct dentry *p, *ret; if (prev == NULL) return dget(root); - spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); relock: p = prev; spin_lock(&p->d_lock); @@ -110,7 +162,7 @@ again: if (p == root) { spin_unlock(&p->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); dput(prev); return NULL; } @@ -140,7 +192,7 @@ again: dget_dlock(ret); spin_unlock(&ret->d_lock); spin_unlock(&p->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); dput(prev); @@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(root); /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) { - spin_unlock(&sbi->fs_lock); - return NULL; - } - managed_dentry_set_transit(root); + if (ino->flags & AUTOFS_INF_PENDING) + goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); ino->flags |= AUTOFS_INF_EXPIRING; @@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, spin_unlock(&sbi->fs_lock); return root; } - managed_dentry_clear_transit(root); +out: spin_unlock(&sbi->fs_lock); dput(root); @@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, timeout = sbi->exp_timeout; dentry = NULL; - while ((dentry = get_next_positive_dentry(dentry, root))) { + while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) - goto cont; - managed_dentry_set_transit(dentry); + goto next; /* * Case 1: (i) indirect mount or top level pseudo direct mount @@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: - managed_dentry_clear_transit(dentry); -cont: spin_unlock(&sbi->fs_lock); } return NULL; @@ -415,13 +461,13 @@ found: ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); spin_lock(&expired->d_parent->d_lock); spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&expired->d_lock); spin_unlock(&expired->d_parent->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); return expired; } @@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; - if (!d_unhashed(dentry)) - managed_dentry_clear_transit(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_EXPIRING; spin_lock(&dentry->d_lock); - if (ret) - __managed_dentry_clear_transit(dentry); - else { + if (!ret) { if ((IS_ROOT(dentry) || (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))) && diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e6f84d26f4c..96804a17bbd 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -23,8 +23,6 @@ #include "autofs_i.h" -DEFINE_SPINLOCK(autofs4_lock); - static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int 
autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); @@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * autofs file system so just let the libfs routines handle * it. */ - spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); return -ENOENT; } spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); out: return dcache_dir_open(inode, file); @@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; list_for_each(p, head) { @@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) dget_dlock(active); spin_unlock(&active->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return active; } next: spin_unlock(&active->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return NULL; } @@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; list_for_each(p, head) { @@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) dget_dlock(expiring); spin_unlock(&expiring->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return expiring; } next: spin_unlock(&expiring->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return NULL; } @@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); - int status; + int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_MOUNT); DPRINTK("mount wait done status=%d", status); - ino->last_used = jiffies; - return status; } - return 0; + ino->last_used = jiffies; + return status; } static int do_expire_wait(struct dentry *dentry) @@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; dput(path->dentry); path->dentry = new; } @@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path) DPRINTK("dentry=%p %.*s", dentry, dentry->d_name.len, dentry->d_name.name); - /* - * Someone may have manually umounted this or it was a submount - * that has gone away. - */ - spin_lock(&dentry->d_lock); - if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) && - (dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - __managed_dentry_set_transit(path->dentry); - } - spin_unlock(&dentry->d_lock); - /* The daemon never triggers a mount. 
*/ if (autofs4_oz_mode(sbi)) return NULL; @@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path) done: if (!(ino->flags & AUTOFS_INF_EXPIRING)) { /* - * Any needed mounting has been completed and the path updated - * so turn this into a normal dentry so we don't continually - * call ->d_automount() and ->d_manage(). - */ - spin_lock(&dentry->d_lock); - __managed_dentry_clear_transit(dentry); - /* + * Any needed mounting has been completed and the path + * updated so clear DCACHE_NEED_AUTOMOUNT so we don't + * call ->d_automount() on rootless multi-mounts since + * it can lead to an incorrect ELOOP error return. + * * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and * symlinks as in all other cases the dentry will be covered by * an actual mount so ->d_automount() won't be called during * the follow. */ + spin_lock(&dentry->d_lock); if ((!d_mountpoint(dentry) && !list_empty(&dentry->d_subdirs)) || (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) @@ -455,6 +436,8 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { + if (rcu_walk) + return 0; if (!d_mountpoint(dentry)) return -EISDIR; return 0; @@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; - spin_lock(&autofs4_lock); - autofs4_add_expiring(dentry); + spin_lock(&sbi->lookup_lock); + __autofs4_add_expiring(dentry); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); return 0; } @@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi)) return -EACCES; - spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return -ENOTEMPTY; } __autofs4_add_expiring(dentry); - spin_unlock(&sbi->lookup_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 56010056b2e..25435987d6a 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -197,12 +197,12 @@ rename_retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&autofs4_lock); + spin_lock(&sbi->fs_lock); for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) len += tmp->d_name.len + 1; if (!len || --len > NAME_MAX) { - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->fs_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; @@ -218,7 +218,7 @@ rename_retry: p -= tmp->d_name.len; strncpy(p, tmp->d_name.name, tmp->d_name.len); } - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->fs_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/fs/block_dev.c b/fs/block_dev.c index 7d02afb2b7f..c1511c674f5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV); static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + 
spin_unlock(&inode_wb_list_lock); } static sector_t max_block(struct block_device *bdev) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ccc991c542d..57c3bb2884c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -136,9 +136,8 @@ struct btrfs_inode { * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. */ - spinlock_t accounting_lock; atomic_t outstanding_extents; - int reserved_extents; + atomic_t reserved_extents; /* * ordered_data_close is set by truncate when a file that used diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4d2110eafe2..41d1d7c70e2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -340,6 +340,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + return -ENOMEM; atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -354,6 +356,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + if(!bio) { + kfree(cb); + return -ENOMEM; + } bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; atomic_inc(&cb->pending_bios); @@ -657,8 +663,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, atomic_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - btrfs_lookup_bio_sums(root, inode, comp_bio, - sums); + ret = btrfs_lookup_bio_sums(root, inode, + comp_bio, sums); + BUG_ON(ret); } sums += (comp_bio->bi_size + root->sectorsize - 1) / root->sectorsize; @@ -683,8 +690,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + BUG_ON(ret); + } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); BUG_ON(ret); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b5baff0dccf..84d7ca1fe0b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -147,10 +147,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - spin_lock(&root->node_lock); - eb = root->node; + + rcu_read_lock(); + eb = rcu_dereference(root->node); extent_buffer_get(eb); - spin_unlock(&root->node_lock); + rcu_read_unlock(); return eb; } @@ -165,14 +166,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) while (1) { eb = btrfs_root_node(root); btrfs_tree_lock(eb); - - spin_lock(&root->node_lock); - if (eb == root->node) { - spin_unlock(&root->node_lock); + if (eb == root->node) break; - } - spin_unlock(&root->node_lock); - btrfs_tree_unlock(eb); free_extent_buffer(eb); } @@ -458,10 +453,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else parent_start = 0; - spin_lock(&root->node_lock); - root->node = cow; extent_buffer_get(cow); - spin_unlock(&root->node_lock); + rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); @@ -542,6 +535,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, ret = 
__btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); + + trace_btrfs_cow_block(root, buf, *cow_ret); + return ret; } @@ -686,6 +682,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); + if (!cur) + return -EIO; } else if (!uptodate) { btrfs_read_buffer(cur, gen); } @@ -732,122 +730,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root, return btrfs_item_offset_nr(leaf, nr - 1); } -/* - * extra debugging checks to make sure all the items in a key are - * well formed and in the proper order - */ -static int check_node(struct btrfs_root *root, struct btrfs_path *path, - int level) -{ - struct extent_buffer *parent = NULL; - struct extent_buffer *node = path->nodes[level]; - struct btrfs_disk_key parent_key; - struct btrfs_disk_key node_key; - int parent_slot; - int slot; - struct btrfs_key cpukey; - u32 nritems = btrfs_header_nritems(node); - - if (path->nodes[level + 1]) - parent = path->nodes[level + 1]; - - slot = path->slots[level]; - BUG_ON(nritems == 0); - if (parent) { - parent_slot = path->slots[level + 1]; - btrfs_node_key(parent, &parent_key, parent_slot); - btrfs_node_key(node, &node_key, 0); - BUG_ON(memcmp(&parent_key, &node_key, - sizeof(struct btrfs_disk_key))); - BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_bytenr(node)); - } - BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); - if (slot != 0) { - btrfs_node_key_to_cpu(node, &cpukey, slot - 1); - btrfs_node_key(node, &node_key, slot); - BUG_ON(comp_keys(&node_key, &cpukey) <= 0); - } - if (slot < nritems - 1) { - btrfs_node_key_to_cpu(node, &cpukey, slot + 1); - btrfs_node_key(node, &node_key, slot); - BUG_ON(comp_keys(&node_key, &cpukey) >= 0); - } - return 0; -} - -/* - * extra checking to make sure all the items in a leaf are - * well formed and in the proper order - */ -static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, - int level) -{ - struct extent_buffer *leaf = path->nodes[level]; - struct extent_buffer *parent = NULL; - int parent_slot; - struct btrfs_key cpukey; - struct btrfs_disk_key parent_key; - struct btrfs_disk_key leaf_key; - int slot = path->slots[0]; - - u32 nritems = btrfs_header_nritems(leaf); - - if (path->nodes[level + 1]) - parent = path->nodes[level + 1]; - - if (nritems == 0) - return 0; - - if (parent) { - parent_slot = path->slots[level + 1]; - btrfs_node_key(parent, &parent_key, parent_slot); - btrfs_item_key(leaf, &leaf_key, 0); - - BUG_ON(memcmp(&parent_key, &leaf_key, - sizeof(struct btrfs_disk_key))); - BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_bytenr(leaf)); - } - if (slot != 0 && slot < nritems - 1) { - btrfs_item_key(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); - if (comp_keys(&leaf_key, &cpukey) <= 0) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad key\n", slot); - BUG_ON(1); - } - if (btrfs_item_offset_nr(leaf, slot - 1) != - btrfs_item_end_nr(leaf, slot)) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad\n", slot); - BUG_ON(1); - } - } - if (slot < nritems - 1) { - btrfs_item_key(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); - BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad\n", slot); - BUG_ON(1); - } - } - BUG_ON(btrfs_item_offset_nr(leaf, 0) + - 
btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); - return 0; -} - -static noinline int check_block(struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - return 0; - if (level == 0) - return check_leaf(root, path, level); - return check_node(root, path, level); -} /* * search for key in the extent_buffer. The items start at offset p, @@ -1046,9 +928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - spin_lock(&root->node_lock); - root->node = child; - spin_unlock(&root->node_lock); + rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -1188,7 +1068,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } /* double check we haven't messed things up */ - check_block(root, path, level); if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); @@ -1798,12 +1677,6 @@ cow_done: if (!cow) btrfs_unlock_up_safe(p, level + 1); - ret = check_block(root, p, level); - if (ret) { - ret = -1; - goto done; - } - ret = bin_search(b, key, level, &slot); if (level != 0) { @@ -2130,10 +2003,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); - spin_lock(&root->node_lock); old = root->node; - root->node = c; - spin_unlock(&root->node_lock); + rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -3840,7 +3711,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root unsigned long ptr; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (!ret) { leaf = path->nodes[0]; @@ -4217,6 +4089,7 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); + BUG_ON(!cur); btrfs_tree_lock(cur); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7f78cc78fdd..d47ce830785 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include <linux/wait.h> #include <linux/slab.h> #include <linux/kobject.h> +#include <trace/events/btrfs.h> #include <asm/kmap_types.h> #include "extent_io.h" #include "extent_map.h" @@ -40,6 +41,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; +extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" @@ -782,9 +784,6 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; - /* if this cluster simply points at a bitmap in the block group */ - bool points_to_bitmap; - struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -1283,6 +1282,7 @@ struct btrfs_root { #define BTRFS_INODE_NODUMP (1 << 8) #define BTRFS_INODE_NOATIME (1 << 9) #define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_COMPRESS (1 << 11) /* some macros to generate set/get funcs for the struct fields. 
This * assumes there is a lefoo_to_cpu for every type, so let's make a simple @@ -2157,6 +2157,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, @@ -2227,10 +2229,12 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end); int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes); + u64 num_bytes, u64 *actual_bytes); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type); +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2392,6 +2396,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -2528,7 +2535,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); -void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); @@ -2536,7 +2543,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_cont_expand(struct inode *inode, loff_t size); +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e807b143b85..bce28f65389 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -483,6 +483,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&head_ref->cluster); mutex_init(&head_ref->mutex); + trace_btrfs_delayed_ref_head(ref, head_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { @@ -537,6 +539,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, } full_ref->level = level; + trace_btrfs_delayed_tree_ref(ref, full_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { @@ -591,6 +595,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, full_ref->objectid = owner; full_ref->offset = offset; + trace_btrfs_delayed_data_ref(ref, full_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { diff --git
a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f0cad5ae5be..c62f02f6ae6 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -151,7 +151,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root ret = PTR_ERR(dir_item); if (ret == -EEXIST) goto second_insert; - goto out; + goto out_free; } leaf = path->nodes[0]; @@ -170,7 +170,7 @@ second_insert: /* FIXME, use some real flag for selecting the extra index */ if (root == root->fs_info->tree_root) { ret = 0; - goto out; + goto out_free; } btrfs_release_path(root, path); @@ -180,7 +180,7 @@ second_insert: name, name_len); if (IS_ERR(dir_item)) { ret2 = PTR_ERR(dir_item); - goto out; + goto out_free; } leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, location); @@ -192,7 +192,9 @@ second_insert: name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name, name_ptr, name_len); btrfs_mark_buffer_dirty(leaf); -out: + +out_free: + btrfs_free_path(path); if (ret) return ret; @@ -377,6 +379,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + if (verify_dir_item(root, leaf, dir_item)) + return NULL; + total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + @@ -429,3 +434,35 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, } return ret; } + +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item) +{ + u16 namelen = BTRFS_NAME_LEN; + u8 type = btrfs_dir_type(leaf, dir_item); + + if (type >= BTRFS_FT_MAX) { + printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + (int)type); + return 1; + } + + if (type == BTRFS_FT_XATTR) + namelen = XATTR_NAME_MAX; + + if (btrfs_dir_name_len(leaf, dir_item) > namelen) { + printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", + (unsigned)btrfs_dir_name_len(leaf, dir_item)); + return 1; + } + + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ + if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { + printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 830d261d0e6..d7a7315bd03 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,6 +29,7 @@ #include <linux/crc32c.h> #include <linux/slab.h> #include <linux/migrate.h> +#include <asm/unaligned.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -198,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) void btrfs_csum_final(u32 crc, char *result) { - *(__le32 *)result = ~cpu_to_le32(crc); + put_unaligned_le32(~crc, result); } /* @@ -323,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, int num_copies = 0; int mirror_num = 0; + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, start, 1, @@ -331,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, !verify_parent_transid(io_tree, eb, parent_transid)) return ret; + /* + * This buffer's crc is fine, but its contents are corrupted, so + * there is no reason to read the other copies, they won't be + * any less wrong.
+ */ + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + return ret; + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) @@ -419,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#define CORRUPT(reason, eb, root, slot) \ + printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d\n", reason, \ + (unsigned long long)btrfs_header_bytenr(eb), \ + (unsigned long long)root->objectid, slot) + +static noinline int check_leaf(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_key leaf_key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (nritems == 0) + return 0; + + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("invalid item offset size pair", leaf, root, 0); + return -EIO; + } + + /* + * Check to make sure each item's keys are in the correct order and their + * offsets make sense. We only have to loop through nritems-1 because + * we check the current slot against the next slot, which verifies the + * next slot's offset+size makes sense and that the current slot's + * offset is correct. + */ + for (slot = 0; slot < nritems - 1; slot++) { + btrfs_item_key_to_cpu(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &key, slot + 1); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EIO; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EIO; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EIO; + } + } + + return 0; +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) { @@ -485,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, btrfs_set_buffer_lockdep_class(eb, found_level); ret = csum_tree_block(root, eb, 1); - if (ret) + if (ret) { ret = -EIO; + goto err; + } + + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO.
+ */ + if (found_level == 0 && check_leaf(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; @@ -1159,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root, fs_info, location->objectid); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + kfree(root); + return ERR_PTR(-ENOMEM); + } ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); if (ret == 0) { l = path->nodes[0]; @@ -1553,6 +1645,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_bdi; } + fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); @@ -1683,6 +1777,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; @@ -1888,6 +1988,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_alloc_profile = (u64)-1; fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "Failed to initialize space info: %d\n", ret); + goto fail_block_groups; + } + ret = btrfs_read_block_groups(extent_root); if (ret) { printk(KERN_ERR "Failed to read block groups: %d\n", ret); @@ -1979,9 +2085,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); - btrfs_orphan_cleanup(fs_info->fs_root); - btrfs_orphan_cleanup(fs_info->tree_root); + err = btrfs_orphan_cleanup(fs_info->fs_root); + if (!err) + err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + if (err) { + close_ctree(tree_root); + return ERR_PTR(err); + } } return tree_root; @@ -2356,8 +2467,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { + int err; + root_objectid = gang[i]->root_key.objectid; - btrfs_orphan_cleanup(gang[i]); + err = btrfs_orphan_cleanup(gang[i]); + if (err) + return err; } root_objectid++; } @@ -2868,7 +2983,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, break; /* opt_discard */ - ret = btrfs_error_discard_extent(root, start, end + 1 - start); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_error_discard_extent(root, start, + end + 1 - start, + NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); btrfs_error_unpin_extent_range(root, start, end); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7b3089b5c2d..f619c3cb13b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,8 +36,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); -static int update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -442,7 +440,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * allocate blocks for the tree root we can't do the fast caching
since * we likely hold important locks. */ - if (!trans->transaction->in_commit && + if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -471,7 +469,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (load_cache_only) return 0; - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); BUG_ON(!caching_ctl); INIT_LIST_HEAD(&caching_ctl->list); @@ -1740,39 +1738,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_issue_discard(struct block_device *bdev, +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes) + u64 num_bytes, u64 *actual_bytes) { int ret; - u64 map_length = num_bytes; + u64 discarded_bytes = 0; struct btrfs_multi_bio *multi = NULL; - if (!btrfs_test_opt(root, DISCARD)) - return 0; /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &map_length, &multi, 0); + ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + bytenr, &num_bytes, &multi, 0); if (!ret) { struct btrfs_bio_stripe *stripe = multi->stripes; int i; - if (map_length > num_bytes) - map_length = num_bytes; for (i = 0; i < multi->num_stripes; i++, stripe++) { - btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - map_length); + ret = btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + stripe->length); + if (!ret) + discarded_bytes += stripe->length; + else if (ret != -EOPNOTSUPP) + break; } kfree(multi); } + if (discarded_bytes && ret == -EOPNOTSUPP) + ret = 0; + + if (actual_bytes) + *actual_bytes = discarded_bytes; + return ret; } @@ -3996,6 +4000,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; + int reserved_extents; int ret; if (btrfs_transaction_in_commit(root->fs_info)) @@ -4003,25 +4008,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); - spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; - if (nr_extents > BTRFS_I(inode)->reserved_extents) { - nr_extents -= BTRFS_I(inode)->reserved_extents; + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + + if (nr_extents > reserved_extents) { + nr_extents -= reserved_extents; to_reserve = calc_trans_metadata_size(root, nr_extents); } else { nr_extents = 0; to_reserve = 0; } - spin_unlock(&BTRFS_I(inode)->accounting_lock); + to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) return ret; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); atomic_inc(&BTRFS_I(inode)->outstanding_extents); - spin_unlock(&BTRFS_I(inode)->accounting_lock); block_rsv_add_bytes(block_rsv, to_reserve, 1); @@ -4036,20 +4040,30 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) struct btrfs_root *root = 
BTRFS_I(inode)->root; u64 to_free; int nr_extents; + int reserved_extents; num_bytes = ALIGN(num_bytes, root->sectorsize); atomic_dec(&BTRFS_I(inode)->outstanding_extents); WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); - spin_lock(&BTRFS_I(inode)->accounting_lock); - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); - if (nr_extents < BTRFS_I(inode)->reserved_extents) { - nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; - BTRFS_I(inode)->reserved_extents -= nr_extents; - } else { - nr_extents = 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + do { + int old, new; + + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents >= reserved_extents) { + nr_extents = 0; + break; + } + old = reserved_extents; + nr_extents = reserved_extents - nr_extents; + new = reserved_extents - nr_extents; + old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, + reserved_extents, new); + if (likely(old == reserved_extents)) + break; + reserved_extents = old; + } while (1); to_free = calc_csum_metadata_size(inode, num_bytes); if (nr_extents > 0) @@ -4223,8 +4237,8 @@ int btrfs_pin_extent(struct btrfs_root *root, * update size of reserved extents. this function may return -EAGAIN * if 'reserve' is true or 'sinfo' is false. */ -static int update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) { int ret = 0; if (sinfo) { @@ -4363,7 +4377,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, if (ret) break; - ret = btrfs_discard_extent(root, start, end + 1 - start); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end); @@ -4704,10 +4720,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = update_reserved_bytes(cache, buf->len, 0, 0); + ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); if (ret == -EAGAIN) { /* block group became read-only */ - update_reserved_bytes(cache, buf->len, 0, 1); + btrfs_update_reserved_bytes(cache, buf->len, 0, 1); goto out; } @@ -4744,6 +4760,11 @@ pin: } } out: + /* + * Deleting the buffer, clear the corrupt flag since it doesn't matter + * anymore. 
+ */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); btrfs_put_block_group(cache); } @@ -5191,7 +5212,7 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = update_reserved_bytes(block_group, num_bytes, 1, + ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, (data & BTRFS_BLOCK_GROUP_DATA)); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); @@ -5397,6 +5418,8 @@ again: dump_space_info(sinfo, num_bytes, 1); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); + return ret; } @@ -5412,12 +5435,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return -ENOSPC; } - ret = btrfs_discard_extent(root, start, len); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - update_reserved_bytes(cache, len, 0, 1); + btrfs_update_reserved_bytes(cache, len, 0, 1); btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(root, start, len); + return ret; } @@ -5444,7 +5470,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, @@ -5614,7 +5641,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -6047,6 +6074,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, blocksize, generation); + if (!next) + return -EIO; btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -6438,10 +6467,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - BUG_ON(!wc); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); @@ -6899,7 +6932,11 @@ static noinline int get_new_locations(struct inode *reloc_inode, } path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + if (exts != *extents) + kfree(exts); + return -ENOMEM; + } cur_pos = extent_key->objectid - offset; last_byte = extent_key->objectid + extent_key->offset; @@ -6941,6 +6978,10 @@ static noinline int get_new_locations(struct inode *reloc_inode, struct disk_extent *old = exts; max *= 2; exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) { + ret = -ENOMEM; + goto out; + } memcpy(exts, old, sizeof(*exts) * nr); if (old != *extents) kfree(old); @@ -7423,7 +7464,8 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, int ret; new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); - BUG_ON(!new_extent); + if (!new_extent) + return -ENOMEM; ref = btrfs_lookup_leaf_ref(root, leaf->start); BUG_ON(!ref); @@ -7609,7 +7651,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root) reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); BUG_ON(!reloc_root); - btrfs_orphan_cleanup(reloc_root); + ret = 
btrfs_orphan_cleanup(reloc_root); + BUG_ON(ret); return 0; } @@ -7627,7 +7670,8 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, return 0; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); + if (!root_item) + return -ENOMEM; ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); @@ -7653,7 +7697,7 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, &root_key); - BUG_ON(!reloc_root); + BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; reloc_root->commit_root = NULL; reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; @@ -7906,6 +7950,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, eb = read_tree_block(found_root, block_start, block_size, 0); + if (!eb) { + ret = -EIO; + goto out; + } btrfs_tree_lock(eb); BUG_ON(level != btrfs_header_level(eb)); @@ -8621,6 +8669,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group); BUG_ON(!block_group->ro); + /* + * Free the reserved super bytes from this block group before + * removing it. + */ + free_excluded_extents(root, block_group); + memcpy(&key, &block_group->key, sizeof(key)); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | @@ -8724,13 +8778,84 @@ out: return ret; } +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + int ret; + + ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0, + &space_info); + if (ret) + return ret; + + ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0, + &space_info); + if (ret) + return ret; + + ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0, + &space_info); + if (ret) + return ret; + + return ret; +} + int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { return unpin_extent_range(root, start, end); } int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes) + u64 num_bytes, u64 *actual_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); +} + +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) { - return btrfs_discard_extent(root, bytenr, num_bytes); + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 group_trimmed; + u64 start; + u64 end; + u64 trimmed = 0; + int ret = 0; + + cache = btrfs_lookup_block_group(fs_info, range->start); + + while (cache) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); + break; + } + + start = max(range->start, cache->key.objectid); + end = min(range->start + range->len, + cache->key.objectid + cache->key.offset); + + if (end - start >= range->minlen) { + if (!block_group_cache_done(cache)) { + ret = cache_block_group(cache, NULL, root, 0); + if (!ret) + wait_block_group_cache_done(cache); + } + ret = btrfs_trim_block_group(cache, + &group_trimmed, + start, + end, + range->minlen); + + trimmed += group_trimmed; + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + + cache = next_block_group(fs_info->tree_root, cache); + } + + range->len = trimmed; + return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b5b92824a27..20ddb28602a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2192,6 +2192,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, else
write_flags = WRITE; + trace___extent_writepage(page, inode, wbc); + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -3690,6 +3692,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, "wanted %lu %lu\n", (unsigned long long)eb->start, eb->len, start, min_len); WARN_ON(1); + return -EINVAL; } p = extent_buffer_page(eb, i); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9318dfefd59..f62c5442835 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -31,6 +31,7 @@ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +#define EXTENT_BUFFER_CORRUPT 3 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 4f19a3e1bf3..a6a9d4e8b49 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; file_key.objectid = objectid; file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); @@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; if (bio->bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f447b783bb8..656bc0a892b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -45,14 +45,14 @@ * and be replaced with calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - int write_bytes, + size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { size_t copied = 0; + size_t total_copied = 0; int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - int total_copied = 0; while (write_bytes > 0) { size_t count = min_t(size_t, @@ -88,9 +88,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, total_copied += copied; /* Return to btrfs_file_aio_write to fault page */ - if (unlikely(copied == 0)) { + if (unlikely(copied == 0)) break; - } if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { offset += copied; @@ -109,8 +108,6 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { - if (!pages[i]) - break; /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty * clear it here @@ -130,13 +127,12 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) * this also makes the decision about creating an inline extent vs * doing real data extents, marking pages dirty and delalloc as required. 
*/ -static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct file *file, - struct page **pages, - size_t num_pages, - loff_t pos, - size_t write_bytes) +static noinline int dirty_and_release_pages(struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + size_t write_bytes) { int err = 0; int i; @@ -154,7 +150,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, NULL); - BUG_ON(err); + if (err) + return err; for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -162,13 +159,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, ClearPageChecked(p); set_page_dirty(p); } - if (end_pos > isize) { + + /* + * we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + if (end_pos > isize) i_size_write(inode, end_pos); - /* we've only changed i_size in ram, and we haven't updated - * the disk i_size. There is no need to log the inode - * at this time. - */ - } return 0; } @@ -610,6 +608,8 @@ again: key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -819,12 +819,11 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; if (start_pos > inode->i_size) { - err = btrfs_cont_expand(inode, start_pos); + err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); if (err) return err; } - memset(pages, 0, num_pages * sizeof(struct page *)); again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); @@ -896,156 +895,71 @@ fail: } -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static noinline ssize_t __btrfs_buffered_write(struct file *file, + struct iov_iter *i, + loff_t pos) { - struct file *file = iocb->ki_filp; struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; - struct iov_iter i; - loff_t *ppos = &iocb->ki_pos; - loff_t start_pos; - ssize_t num_written = 0; - ssize_t err = 0; - size_t count; - size_t ocount; - int ret = 0; - int nrptrs; unsigned long first_index; unsigned long last_index; - int will_write; - int buffered = 0; - int copied = 0; - int dirty_pages = 0; - - will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || - (file->f_flags & O_DIRECT)); - - start_pos = pos; - - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - - mutex_lock(&inode->i_mutex); - - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - goto out; - count = ocount; - - current->backing_dev_info = inode->i_mapping->backing_dev_info; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - - if (count == 0) - goto out; - - err = file_remove_suid(file); - if (err) - goto out; - - /* - * If BTRFS flips readonly due to some impossible error - * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), - * although we have opened a file as writable, we have - * to stop this write operation to ensure FS consistency. 
- */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - err = -EROFS; - goto out; - } - - file_update_time(file); - BTRFS_I(inode)->sequence++; - - if (unlikely(file->f_flags & O_DIRECT)) { - num_written = generic_file_direct_write(iocb, iov, &nr_segs, - pos, ppos, count, - ocount); - /* - * the generic O_DIRECT will update in-memory i_size after the - * DIOs are done. But our endio handlers that update the on - * disk i_size never update past the in memory i_size. So we - * need one more update here to catch any additions to the - * file - */ - if (inode->i_size != BTRFS_I(inode)->disk_i_size) { - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - mark_inode_dirty(inode); - } - - if (num_written < 0) { - ret = num_written; - num_written = 0; - goto out; - } else if (num_written == count) { - /* pick up pos changes done by the generic code */ - pos = *ppos; - goto out; - } - /* - * We are going to do buffered for the rest of the range, so we - * need to make sure to invalidate the buffered pages when we're - * done. - */ - buffered = 1; - pos += num_written; - } + size_t num_written = 0; + int nrptrs; + int ret; - iov_iter_init(&i, iov, nr_segs, count, num_written); - nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / + nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out; - } - - /* generic_write_checks can change our pos */ - start_pos = pos; + if (!pages) + return -ENOMEM; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT; - while (iov_iter_count(&i) > 0) { + while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(iov_iter_count(&i), + size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); size_t num_pages = (write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t dirty_pages; + size_t copied; WARN_ON(num_pages > nrptrs); - memset(pages, 0, sizeof(struct page *) * nrptrs); /* * Fault pages before locking them in prepare_pages * to avoid recursive lock */ - if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { + if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { ret = -EFAULT; - goto out; + break; } ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); if (ret) - goto out; + break; + /* + * This is going to set up the pages array with the number of + * pages we want, so we don't really need to worry about the + * contents of pages from loop to loop + */ ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, write_bytes); if (ret) { btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); - goto out; + break; } copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, &i); + write_bytes, pages, i); /* * if we have trouble faulting in the pages, fall @@ -1061,6 +975,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* + * If we had a short copy we need to release the excess delalloc + * bytes we reserved. We need to increment outstanding_extents + * because btrfs_delalloc_release_space will decrement it, but + * we still have an outstanding extent for the chunk we actually + * managed to copy.
+ */ if (num_pages > dirty_pages) { if (copied > 0) atomic_inc( @@ -1071,39 +992,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } if (copied > 0) { - dirty_and_release_pages(NULL, root, file, pages, - dirty_pages, pos, copied); + ret = dirty_and_release_pages(root, file, pages, + dirty_pages, pos, + copied); + if (ret) { + btrfs_delalloc_release_space(inode, + dirty_pages << PAGE_CACHE_SHIFT); + btrfs_drop_pages(pages, num_pages); + break; + } } btrfs_drop_pages(pages, num_pages); - if (copied > 0) { - if (will_write) { - filemap_fdatawrite_range(inode->i_mapping, pos, - pos + copied - 1); - } else { - balance_dirty_pages_ratelimited_nr( - inode->i_mapping, - dirty_pages); - if (dirty_pages < - (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); - } - } + cond_resched(); + + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_pages); + if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); pos += copied; num_written += copied; + } - cond_resched(); + kfree(pages); + + return num_written ? num_written : ret; +} + +static ssize_t __btrfs_direct_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + loff_t *ppos, size_t count, size_t ocount) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct iov_iter i; + ssize_t written; + ssize_t written_buffered; + loff_t endbyte; + int err; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + count, ocount); + + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); } + + if (written < 0 || written == count) + return written; + + pos += written; + count -= written; + iov_iter_init(&i, iov, nr_segs, count, written); + written_buffered = __btrfs_buffered_write(file, &i, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + endbyte = pos + written_buffered - 1; + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + if (err) + goto out; + written += written_buffered; + *ppos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); out: - mutex_unlock(&inode->i_mutex); - if (ret) - err = ret; + return written ? 
written : err; +} - kfree(pages); - *ppos = pos; +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + loff_t *ppos = &iocb->ki_pos; + ssize_t num_written = 0; + ssize_t err = 0; + size_t count, ocount; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + mutex_lock(&inode->i_mutex); + + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + count = ocount; + + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + if (count == 0) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + err = file_remove_suid(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + mutex_unlock(&inode->i_mutex); + err = -EROFS; + goto out; + } + + file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = __btrfs_direct_write(iocb, iov, nr_segs, + pos, ppos, count, ocount); + } else { + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, num_written); + + num_written = __btrfs_buffered_write(file, &i, pos); + if (num_written > 0) + *ppos = pos + num_written; + } + + mutex_unlock(&inode->i_mutex); /* * we want to make sure fsync finds this change @@ -1118,43 +1157,12 @@ out: * one running right now. */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; - - if (num_written > 0 && will_write) { - struct btrfs_trans_handle *trans; - - err = btrfs_wait_ordered_range(inode, start_pos, num_written); - if (err) + if (num_written > 0 || num_written == -EIOCBQUEUED) { + err = generic_write_sync(file, pos, num_written); + if (err < 0 && num_written > 0) num_written = err; - - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - num_written = PTR_ERR(trans); - goto done; - } - mutex_lock(&inode->i_mutex); - ret = btrfs_log_dentry_safe(trans, root, - file->f_dentry); - mutex_unlock(&inode->i_mutex); - if (ret == 0) { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - btrfs_end_transaction(trans, root); - else - btrfs_commit_transaction(trans, root); - } else if (ret != BTRFS_NO_LOG_SYNC) { - btrfs_commit_transaction(trans, root); - } else { - btrfs_end_transaction(trans, root); - } - } - if (file->f_flags & O_DIRECT && buffered) { - invalidate_mapping_pages(inode->i_mapping, - start_pos >> PAGE_CACHE_SHIFT, - (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); - } } -done: +out: current->backing_dev_info = NULL; return num_written ? 
num_written : err; } @@ -1197,6 +1205,7 @@ int btrfs_sync_file(struct file *file, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + trace_btrfs_sync_file(file, datasync); /* we wait first, since the writeback may change the inode */ root->log_batch++; @@ -1324,7 +1333,8 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); + ret = btrfs_cont_expand(inode, i_size_read(inode), + alloc_start); if (ret) goto out; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a0390657451..0037427d8a9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -393,7 +393,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, break; need_loop = 1; - e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); if (!e) { kunmap(page); unlock_page(page); @@ -405,7 +406,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, e->bytes = le64_to_cpu(entry->bytes); if (!e->bytes) { kunmap(page); - kfree(e); + kmem_cache_free(btrfs_free_space_cachep, e); unlock_page(page); page_cache_release(page); goto free_cache; @@ -420,7 +421,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { kunmap(page); - kfree(e); + kmem_cache_free( + btrfs_free_space_cachep, e); unlock_page(page); page_cache_release(page); goto free_cache; @@ -1187,7 +1189,7 @@ static void free_bitmap(struct btrfs_block_group_cache *block_group, { unlink_free_space(block_group, bitmap_info); kfree(bitmap_info->bitmap); - kfree(bitmap_info); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); block_group->total_bitmaps--; recalculate_thresholds(block_group); } @@ -1285,9 +1287,22 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (block_group->free_extents < block_group->extents_thresh && - info->bytes > block_group->sectorsize * 4) - return 0; + if (block_group->free_extents < block_group->extents_thresh) { + /* + * If this block group has some small extents we don't want to + * use up all of our free slots in the cache with them; we want + * to reserve them for larger extents. However, if we have plenty + * of cache left then go ahead and add them; no sense in adding + * the overhead of a bitmap if we don't have to. 
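
The extent-versus-bitmap decision above distills to a small predicate. A sketch with the field names shortened and a 4K sectorsize assumed:

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirror of the insert_into_bitmap() policy: below the threshold, large
 * extents always stay extents, and small ones (at most 4 sectors) are
 * only allowed to consume up to half of the extent slots before they are
 * pushed into bitmaps instead.
 */
static bool add_as_extent(int free_extents, int extents_thresh,
			  unsigned long bytes, unsigned long sectorsize)
{
	if (free_extents >= extents_thresh)
		return false;			/* over threshold: bitmap */
	if (bytes <= sectorsize * 4)		/* a "small" extent */
		return free_extents * 2 <= extents_thresh;
	return true;
}

int main(void)
{
	/* an 8K extent with 40 of 100 slots used is still added directly */
	printf("%d\n", add_as_extent(40, 100, 8192, 4096));
	/* the same extent with 60 slots used falls back to a bitmap */
	printf("%d\n", add_as_extent(60, 100, 8192, 4096));
	return 0;
}

This keeps bitmap overhead off mostly-empty block groups while still protecting the slot budget once small extents start to pile up.
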
+ */ + if (info->bytes <= block_group->sectorsize * 4) { + if (block_group->free_extents * 2 <= + block_group->extents_thresh) + return 0; + } else { + return 0; + } + } /* * some block groups are so tiny they can't be enveloped by a bitmap, so @@ -1342,8 +1357,8 @@ new_bitmap: /* no pre-allocated info, allocate a new one */ if (!info) { - info = kzalloc(sizeof(struct btrfs_free_space), - GFP_NOFS); + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); if (!info) { spin_lock(&block_group->tree_lock); ret = -ENOMEM; @@ -1365,7 +1380,7 @@ out: if (info) { if (info->bitmap) kfree(info->bitmap); - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); } return ret; @@ -1398,7 +1413,7 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group, else __unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; - kfree(right_info); + kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; } @@ -1410,7 +1425,7 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group, __unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; - kfree(left_info); + kmem_cache_free(btrfs_free_space_cachep, left_info); merged = true; } @@ -1423,7 +1438,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *info; int ret = 0; - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; @@ -1450,7 +1465,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, link: ret = link_free_space(block_group, info); if (ret) - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); out: spin_unlock(&block_group->tree_lock); @@ -1520,7 +1535,7 @@ again: kfree(info->bitmap); block_group->total_bitmaps--; } - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); goto out_lock; } @@ -1556,7 +1571,7 @@ again: /* the hole we're creating ends at the end * of the info struct, just free the info */ - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); } spin_unlock(&block_group->tree_lock); @@ -1629,30 +1644,28 @@ __btrfs_return_cluster_to_free_space( { struct btrfs_free_space *entry; struct rb_node *node; - bool bitmap; spin_lock(&cluster->lock); if (cluster->block_group != block_group) goto out; - bitmap = cluster->points_to_bitmap; cluster->block_group = NULL; cluster->window_start = 0; list_del_init(&cluster->block_group_list); - cluster->points_to_bitmap = false; - - if (bitmap) - goto out; node = rb_first(&cluster->root); while (node) { + bool bitmap; + entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); - BUG_ON(entry->bitmap); - try_merge_free_space(block_group, entry, false); + + bitmap = (entry->bitmap != NULL); + if (!bitmap) + try_merge_free_space(block_group, entry, false); tree_insert_offset(&block_group->free_space_offset, - entry->offset, &entry->offset_index, 0); + entry->offset, &entry->offset_index, bitmap); } cluster->root = RB_ROOT; @@ -1689,7 +1702,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) unlink_free_space(block_group, info); if (info->bitmap) kfree(info->bitmap); - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); if (need_resched()) { spin_unlock(&block_group->tree_lock); cond_resched(); @@ -1722,7 +1735,7 @@ u64 btrfs_find_space_for_alloc(struct 
btrfs_block_group_cache *block_group, entry->offset += bytes; entry->bytes -= bytes; if (!entry->bytes) - kfree(entry); + kmem_cache_free(btrfs_free_space_cachep, entry); else link_free_space(block_group, entry); } @@ -1775,50 +1788,24 @@ int btrfs_return_cluster_to_free_space( static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, + struct btrfs_free_space *entry, u64 bytes, u64 min_start) { - struct btrfs_free_space *entry; int err; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; - spin_lock(&block_group->tree_lock); - spin_lock(&cluster->lock); - - if (!cluster->points_to_bitmap) - goto out; - - if (cluster->block_group != block_group) - goto out; - - /* - * search_start is the beginning of the bitmap, but at some point it may - * be a good idea to point to the actual start of the free area in the - * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only - * to 1 to make sure we get the bitmap entry - */ - entry = tree_search_offset(block_group, - offset_to_bitmap(block_group, search_start), - 1, 0); - if (!entry || !entry->bitmap) - goto out; - search_start = min_start; search_bytes = bytes; err = search_bitmap(block_group, entry, &search_start, &search_bytes); if (err) - goto out; + return 0; ret = search_start; bitmap_clear_bits(block_group, entry, ret, bytes); - if (entry->bytes == 0) - free_bitmap(block_group, entry); -out: - spin_unlock(&cluster->lock); - spin_unlock(&block_group->tree_lock); return ret; } @@ -1836,10 +1823,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct rb_node *node; u64 ret = 0; - if (cluster->points_to_bitmap) - return btrfs_alloc_from_bitmap(block_group, cluster, bytes, - min_start); - spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -1852,9 +1835,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, goto out; entry = rb_entry(node, struct btrfs_free_space, offset_index); - while(1) { - if (entry->bytes < bytes || entry->offset < min_start) { + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { struct rb_node *node; node = rb_next(&entry->offset_index); @@ -1864,10 +1847,27 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, offset_index); continue; } - ret = entry->offset; - entry->offset += bytes; - entry->bytes -= bytes; + if (entry->bitmap) { + ret = btrfs_alloc_from_bitmap(block_group, + cluster, entry, bytes, + min_start); + if (ret == 0) { + struct rb_node *node; + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + } else { + + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + } if (entry->bytes == 0) rb_erase(&entry->offset_index, &cluster->root); @@ -1884,7 +1884,12 @@ out: block_group->free_space -= bytes; if (entry->bytes == 0) { block_group->free_extents--; - kfree(entry); + if (entry->bitmap) { + kfree(entry->bitmap); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + kmem_cache_free(btrfs_free_space_cachep, entry); } spin_unlock(&block_group->tree_lock); @@ -1904,12 +1909,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; + int ret; bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - 
search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); - total_bits = bytes_to_bits(bytes, block_group->sectorsize); + search_bits = bytes_to_bits(bytes, block_group->sectorsize); + total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -1926,7 +1932,7 @@ again: } if (!found_bits) - return -1; + return -ENOSPC; if (!found) { start = i; @@ -1950,189 +1956,208 @@ again: cluster->window_start = start * block_group->sectorsize + entry->offset; - cluster->points_to_bitmap = true; + rb_erase(&entry->offset_index, &block_group->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + BUG_ON(ret); return 0; } /* - * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. - * We might not find them all in one contiguous area. - * - * returns zero and sets up cluster if things worked out, otherwise - * it returns -enospc + * This searches the block group for just extents to fill the cluster with. */ -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 empty_size) +static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) { + struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *prev = NULL; + struct btrfs_free_space *last; struct rb_node *node; - struct btrfs_free_space *next; - struct btrfs_free_space *last = NULL; - u64 min_bytes; u64 window_start; u64 window_free; - u64 max_extent = 0; - bool found_bitmap = false; - int ret; + u64 max_extent; + u64 max_gap = 128 * 1024; - /* for metadata, allow allocates with more holes */ - if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; - } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); - - spin_lock(&block_group->tree_lock); - spin_lock(&cluster->lock); - - /* someone already found a cluster, hooray */ - if (cluster->block_group) { - ret = 0; - goto out; - } -again: - entry = tree_search_offset(block_group, offset, found_bitmap, 1); - if (!entry) { - ret = -ENOSPC; - goto out; - } + entry = tree_search_offset(block_group, offset, 0, 1); + if (!entry) + return -ENOSPC; /* - * If found_bitmap is true, we exhausted our search for extent entries, - * and we just want to search all of the bitmaps that we can find, and - * ignore any extent entries we find. + * We don't want bitmaps, so just move along until we find a normal + * extent entry. 
*/ - while (entry->bitmap || found_bitmap || - (!entry->bitmap && entry->bytes < min_bytes)) { - struct rb_node *node = rb_next(&entry->offset_index); - - if (entry->bitmap && entry->bytes > bytes + empty_size) { - ret = btrfs_bitmap_cluster(block_group, entry, cluster, - offset, bytes + empty_size, - min_bytes); - if (!ret) - goto got_it; - } - - if (!node) { - ret = -ENOSPC; - goto out; - } + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); } - /* - * We already searched all the extent entries from the passed in offset - * to the end and didn't find enough space for the cluster, and we also - * didn't find any bitmaps that met our criteria, just go ahead and exit - */ - if (found_bitmap) { - ret = -ENOSPC; - goto out; - } - - cluster->points_to_bitmap = false; window_start = entry->offset; window_free = entry->bytes; - last = entry; max_extent = entry->bytes; + first = entry; + last = entry; + prev = entry; - while (1) { - /* out window is just right, lets fill it */ - if (window_free >= bytes + empty_size) - break; - - node = rb_next(&last->offset_index); - if (!node) { - if (found_bitmap) - goto again; - ret = -ENOSPC; - goto out; - } - next = rb_entry(node, struct btrfs_free_space, offset_index); + while (window_free <= min_bytes) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); - /* - * we found a bitmap, so if this search doesn't result in a - * cluster, we know to go and search again for the bitmaps and - * start looking for space there - */ - if (next->bitmap) { - if (!found_bitmap) - offset = next->offset; - found_bitmap = true; - last = next; + if (entry->bitmap) continue; - } - /* * we haven't filled the empty size and the window is * very large. reset and try again */ - if (next->offset - (last->offset + last->bytes) > 128 * 1024 || - next->offset - window_start > (bytes + empty_size) * 2) { - entry = next; + if (entry->offset - (prev->offset + prev->bytes) > max_gap || + entry->offset - window_start > (min_bytes * 2)) { + first = entry; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = entry->bytes; } else { - last = next; - window_free += next->bytes; + last = entry; + window_free += entry->bytes; if (entry->bytes > max_extent) max_extent = entry->bytes; } + prev = entry; } - cluster->window_start = entry->offset; + cluster->window_start = first->offset; + + node = &first->offset_index; /* * now we've found our entries, pull them out of the free space * cache and put them into the cluster rbtree - * - * The cluster includes an rbtree, but only uses the offset index - * of each free space cache entry. */ - while (1) { + do { + int ret; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap && node) { - entry = rb_entry(node, struct btrfs_free_space, - offset_index); + if (entry->bitmap) continue; - } else if (entry->bitmap && !node) { - break; - } rb_erase(&entry->offset_index, &block_group->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); BUG_ON(ret); + } while (node && entry != last); - if (!node || entry == last) - break; + cluster->max_size = max_extent; + + return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. 
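
The extent pass in setup_cluster_no_bitmap() above is a sliding-window scan over the offset-sorted entries. A compact sketch over a plain array, keeping the patch's 128K maximum gap but leaving out the rbtree and locking:

#include <stdio.h>

struct extent { unsigned long long offset, bytes; };

/*
 * Grow a window entry by entry; restart it whenever the gap to the
 * previous entry exceeds max_gap or the window span exceeds twice
 * min_bytes. Returns the index of the first window entry, or -1 if no
 * window ever accumulates more than min_bytes of free space.
 */
static int find_window(const struct extent *e, int n,
		       unsigned long long min_bytes)
{
	const unsigned long long max_gap = 128 * 1024;	/* as in the patch */
	unsigned long long window_start, window_free;
	int first = 0, i;

	if (n == 0)
		return -1;
	window_start = e[0].offset;
	window_free = e[0].bytes;
	for (i = 1; window_free <= min_bytes; i++) {
		if (i == n)
			return -1;
		if (e[i].offset - (e[i - 1].offset + e[i - 1].bytes) > max_gap ||
		    e[i].offset - window_start > min_bytes * 2) {
			first = i;		/* restart the window here */
			window_start = e[i].offset;
			window_free = e[i].bytes;
		} else {
			window_free += e[i].bytes;
		}
	}
	return first;
}

int main(void)
{
	struct extent e[] = { {0, 4096}, {8192, 4096},
			      {1 << 20, 65536}, {(1 << 20) + 65536, 65536} };

	printf("window starts at entry %d\n", find_window(e, 4, 100000));
	return 0;
}

The two 4K entries near offset 0 never reach 100000 bytes of window space, and the megabyte-sized gap forces a restart, so the window settles on the pair of 64K entries at 1M.
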
+ */ +static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = -ENOSPC; + + if (block_group->total_bitmaps == 0) + return -ENOSPC; + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 0, 1); + if (!entry) + return -ENOSPC; + + node = &entry->offset_index; + do { entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (!entry->bitmap) + continue; + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + } while (ret && node); + + return ret; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + u64 min_bytes; + int ret; + + /* for metadata, allow allocates with more holes */ + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + + /* + * If we know we don't have enough space to make a cluster don't even + * bother doing all the work to try and find one. 
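
The min_bytes policy in btrfs_find_space_cluster() above is easiest to see with concrete numbers (values purely illustrative):

#include <stdio.h>

static unsigned long long max64(unsigned long long a, unsigned long long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long long bytes = 64 * 1024;		/* requested */
	unsigned long long empty_size = 960 * 1024;	/* desired slack */

	/* metadata, no delayed-ref flush running: 1/16th floor */
	printf("metadata: %llu\n", max64(bytes, (bytes + empty_size) >> 4));
	/* metadata while flushing delayed refs: 1/2 floor */
	printf("flushing: %llu\n", max64(bytes, (bytes + empty_size) >> 1));
	/* data: 1/4 floor */
	printf("data:     %llu\n", max64(bytes, (bytes + empty_size) >> 2));
	return 0;
}

So for a 64K request with roughly a megabyte of desired slack, metadata settles for a 64K window, data insists on 256K, and a delayed-ref flush pushes the floor to 512K so the allocator builds bigger clusters.
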
+ */ + if (block_group->free_space < min_bytes) { + spin_unlock(&block_group->tree_lock); + return -ENOSPC; } - cluster->max_size = max_extent; -got_it: - ret = 0; - atomic_inc(&block_group->count); - list_add_tail(&cluster->block_group_list, &block_group->cluster_list); - cluster->block_group = block_group; + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } + + ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes, + min_bytes); + if (ret) + ret = setup_cluster_bitmap(block_group, cluster, offset, + bytes, min_bytes); + + if (!ret) { + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; + } out: spin_unlock(&cluster->lock); spin_unlock(&block_group->tree_lock); @@ -2149,8 +2174,99 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; - cluster->points_to_bitmap = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space *entry = NULL; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 bytes = 0; + u64 actually_trimmed; + int ret = 0; + + *trimmed = 0; + + while (start < end) { + spin_lock(&block_group->tree_lock); + + if (block_group->free_space < minlen) { + spin_unlock(&block_group->tree_lock); + break; + } + + entry = tree_search_offset(block_group, start, 0, 1); + if (!entry) + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, + start), + 1, 1); + + if (!entry || entry->offset >= end) { + spin_unlock(&block_group->tree_lock); + break; + } + + if (entry->bitmap) { + ret = search_bitmap(block_group, entry, &start, &bytes); + if (!ret) { + if (start >= end) { + spin_unlock(&block_group->tree_lock); + break; + } + bytes = min(bytes, end - start); + bitmap_clear_bits(block_group, entry, + start, bytes); + if (entry->bytes == 0) + free_bitmap(block_group, entry); + } else { + start = entry->offset + BITS_PER_BITMAP * + block_group->sectorsize; + spin_unlock(&block_group->tree_lock); + ret = 0; + continue; + } + } else { + start = entry->offset; + bytes = min(entry->bytes, end - start); + unlink_free_space(block_group, entry); + kfree(entry); + } + + spin_unlock(&block_group->tree_lock); + + if (bytes >= minlen) { + int update_ret; + update_ret = btrfs_update_reserved_bytes(block_group, + bytes, 1, 1); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, + bytes, + &actually_trimmed); + + btrfs_add_free_space(block_group, + start, bytes); + if (!update_ret) + btrfs_update_reserved_bytes(block_group, + bytes, 0, 1); + + if (ret) + break; + *trimmed += actually_trimmed; + } + start += bytes; + bytes = 0; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index e49ca5c321b..65c3b935289 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -68,4 +68,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, int btrfs_return_cluster_to_free_space( struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster); +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 
*trimmed, u64 start, u64 end, u64 minlen); #endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index c56eb590917..c05a08f4c41 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -30,7 +30,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) int slot; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; search_key.objectid = BTRFS_LAST_FREE_OBJECTID; search_key.type = -1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 119520bdb9a..93c28a1d6bd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "tree-log.h" #include "compression.h" #include "locking.h" +#include "free-space-cache.h" struct btrfs_iget_args { u64 ino; @@ -70,6 +71,7 @@ static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -82,7 +84,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static void btrfs_truncate(struct inode *inode); +static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, @@ -288,6 +291,7 @@ static noinline int add_async_extent(struct async_cow *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -382,9 +386,11 @@ again: */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress))) { + (BTRFS_I(inode)->force_compress) || + (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + BUG_ON(!pages); if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; @@ -1254,7 +1260,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress)) + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); else @@ -1461,8 +1468,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); - } else if (!skip_sum) - btrfs_lookup_bio_sums(root, inode, bio, NULL); + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); + if (ret) + return ret; + } goto mapit; } else if (!skip_sum) { /* csum items have already been cloned */ @@ -1785,6 +1795,8 @@ out: static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); + ClearPagePrivate2(page); return btrfs_finish_ordered_io(page->mapping->host, start, end); } @@ -1895,10 +1907,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, else rw = 
READ; - BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, failrec->bio_flags, 0); - return 0; + return ret; } /* @@ -2282,7 +2294,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) * this cleans up any orphans that may be left on the list from the last use * of this root. */ -void btrfs_orphan_cleanup(struct btrfs_root *root) +int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -2292,10 +2304,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) - return; + return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; @@ -2304,11 +2319,8 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - printk(KERN_ERR "Error searching slot for orphan: %d" - "\n", ret); - break; - } + if (ret < 0) + goto out; /* * if ret == 0 means we found what we were searching for, which @@ -2316,6 +2328,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * find the key and see if we have stuff that matches */ if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -2343,7 +2356,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - BUG_ON(IS_ERR(inode)); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } /* * add this inode to the orphan list so btrfs_orphan_del does @@ -2361,7 +2377,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) */ if (is_bad_inode(inode)) { trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2370,17 +2389,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); + iput(inode); + continue; + } nr_truncate++; - btrfs_truncate(inode); + ret = btrfs_truncate(inode); } else { nr_unlink++; } /* this will do delete_inode and everything for us */ iput(inode); + if (ret) + goto out; } - btrfs_free_path(path); - root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) @@ -2389,14 +2413,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) if (root->orphan_block_rsv || root->orphan_item_inserted) { trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - btrfs_end_transaction(trans, root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans, root); } if (nr_unlink) printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); if (nr_truncate) printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + +out: + if (ret) + printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); + btrfs_free_path(path); + return ret; } /* @@ -2507,6 +2537,8 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + if (location.objectid == BTRFS_FREE_SPACE_OBJECTID) + inode->i_mapping->flags &= ~__GFP_FS; /* * try to 
precache a NULL acl entry for files that don't have @@ -2635,10 +2667,10 @@ failed: * recovery code. It removes a link in a directory with a given name, and * also drops the back refs in the inode to the directory */ -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, struct inode *inode, - const char *name, int name_len) +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) { struct btrfs_path *path; int ret = 0; @@ -2710,12 +2742,25 @@ err: btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; btrfs_update_inode(trans, root, dir); - btrfs_drop_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); out: return ret; } +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + int ret; + ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (!ret) { + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } + return ret; +} + + /* helper to check if there is any shared block in the path */ static int check_path_shared(struct btrfs_root *root, struct btrfs_path *path) @@ -3537,7 +3582,13 @@ out: return ret; } -int btrfs_cont_expand(struct inode *inode, loff_t size) +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for + * the range between oldsize and size + */ +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3545,7 +3596,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; - u64 hole_start = (inode->i_size + mask) & ~mask; + u64 hole_start = (oldsize + mask) & ~mask; u64 block_end = (size + mask) & ~mask; u64 last_byte; u64 cur_offset; @@ -3590,13 +3641,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, &hint_byte, 1); - BUG_ON(err); + if (err) + break; err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); - BUG_ON(err); + if (err) + break; btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); @@ -3616,81 +3669,41 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) return err; } -static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +static int btrfs_setsize(struct inode *inode, loff_t newsize) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - unsigned long nr; + loff_t oldsize = i_size_read(inode); int ret; - if (attr->ia_size == inode->i_size) + if (newsize == oldsize) return 0; - if (attr->ia_size > inode->i_size) { - unsigned long limit; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (attr->ia_size > inode->i_sb->s_maxbytes) - return -EFBIG; - if (limit != RLIM_INFINITY && attr->ia_size > limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - } - - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - 
btrfs_set_trans_block_group(trans, inode); - - ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - - if (attr->ia_size > inode->i_size) { - ret = btrfs_cont_expand(inode, attr->ia_size); + if (newsize > oldsize) { + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + truncate_pagecache(inode, oldsize, newsize); + ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_truncate(inode); + btrfs_setsize(inode, oldsize); return ret; } - i_size_write(inode, attr->ia_size); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } else { - trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; - BUG_ON(!trans->block_rsv); + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (newsize == 0) + BTRFS_I(inode)->ordered_data_close = 1; - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - if (inode->i_nlink > 0) { - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); - } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - return 0; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_setsize(inode, newsize); + ret = btrfs_truncate(inode); } - /* - * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. - */ - if (attr->ia_size == 0) - BTRFS_I(inode)->ordered_data_close = 1; - - /* we don't support swapfiles, so vmtruncate shouldn't fail */ - ret = vmtruncate(inode, attr->ia_size); - BUG_ON(ret); - - return 0; + return ret; } static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -3707,7 +3720,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setattr_size(inode, attr); + err = btrfs_setsize(inode, attr->ia_size); if (err) return err; } @@ -3730,6 +3743,8 @@ void btrfs_evict_inode(struct inode *inode) unsigned long nr; int ret; + trace_btrfs_inode_evict(inode); + truncate_inode_pages(&inode->i_data, 0); if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || root == root->fs_info->tree_root)) @@ -4072,7 +4087,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - inode_tree_add(inode); unlock_new_inode(inode); if (new) @@ -4147,8 +4161,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (!IS_ERR(inode) && root != sub_root) { down_read(&root->fs_info->cleanup_work_sem); if (!(inode->i_sb->s_flags & MS_RDONLY)) - btrfs_orphan_cleanup(sub_root); + ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); + if (ret) + inode = ERR_PTR(ret); } return inode; @@ -4282,6 +4298,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; + if (verify_dir_item(root, leaf, di)) + break; + name_len = btrfs_dir_name_len(leaf, di); if (name_len <= sizeof(tmp_name)) { name_ptr = tmp_name; @@ 
-4517,6 +4536,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); if (dir) { + trace_btrfs_inode_request(dir); + ret = btrfs_set_inode_index(dir, index); if (ret) { iput(inode); @@ -4585,12 +4606,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if ((mode & S_IFREG)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root, NODATACOW) || + (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } insert_inode_hash(inode); inode_tree_add(inode); + + trace_btrfs_inode_new(inode); + return inode; fail: if (dir) @@ -4809,7 +4834,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) - return -EPERM; + return -EXDEV; + + if (inode->i_nlink == ~0U) + return -EMLINK; btrfs_inc_nlink(inode); inode->i_ctime = CURRENT_TIME; @@ -5265,6 +5293,9 @@ insert: } write_unlock(&em_tree->lock); out: + + trace_btrfs_get_extent(root, em); + if (path) btrfs_free_path(path); if (trans) { @@ -5748,6 +5779,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip->csums); kfree(dip); + + /* If we had a csum failure make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); dio_end_io(bio, err); } @@ -5849,6 +5884,10 @@ out_done: kfree(dip->csums); kfree(dip); + + /* If we had an error make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); dio_end_io(bio, err); } @@ -5922,9 +5961,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; - } else if (!skip_sum) - btrfs_lookup_bio_sums_dio(root, inode, bio, + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset, csums); + if (ret) + goto err; + } ret = btrfs_map_bio(root, rw, bio, 0, 1); err: @@ -5948,6 +5990,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int nr_pages = 0; u32 *csums = dip->csums; int ret = 0; + int write = rw & REQ_WRITE; bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) @@ -5984,7 +6027,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, goto out_err; } - if (!skip_sum) + /* Writes use the ordered csums */ + if (!write && !skip_sum) csums = csums + nr_pages; start_sector += submit_len >> 9; file_offset += submit_len; @@ -6052,7 +6096,8 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, } dip->csums = NULL; - if (!skip_sum) { + /* Writes use the ordered csum stuff, so we don't need dip->csums */ + if (!write && !skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { kfree(dip); @@ -6474,28 +6519,42 @@ out: return ret; } -static void btrfs_truncate(struct inode *inode) +static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int err = 0; struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; - if (!S_ISREG(inode->i_mode)) { - WARN_ON(1); - return; - } - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) - return; + return ret; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, 
inode->i_size, NULL); + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + /* Now start a transaction for the truncate */ trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; @@ -6522,29 +6581,38 @@ static void btrfs_truncate(struct inode *inode) while (1) { if (!trans) { trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; } ret = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, 0, 5); - if (ret) { - BUG_ON(ret != -EAGAIN); + if (ret == -EAGAIN) { ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (ret) + return ret; trans = NULL; continue; + } else if (ret) { + err = ret; + break; } ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -EAGAIN) + if (ret != -EAGAIN) { + err = ret; break; + } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (ret) { + err = ret; + break; + } nr = trans->blocks_used; btrfs_end_transaction(trans, root); @@ -6554,16 +6622,27 @@ static void btrfs_truncate(struct inode *inode) if (ret == 0 && inode->i_nlink > 0) { ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + if (ret) + err = ret; + } else if (ret && inode->i_nlink > 0) { + /* + * Failed to do the truncate, remove us from the in memory + * orphan list. 
+ */ + ret = btrfs_orphan_del(NULL, inode); } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (ret && !err) + err = ret; nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); - BUG_ON(ret); + if (ret && !err) + err = ret; btrfs_btree_balance_dirty(root, nr); + + return err; } /* @@ -6630,9 +6709,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; - spin_lock_init(&ei->accounting_lock); atomic_set(&ei->outstanding_extents, 0); - ei->reserved_extents = 0; + atomic_set(&ei->reserved_extents, 0); ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; @@ -6668,7 +6746,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); - WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); /* * This can happen where we create an inode, but somebody else also @@ -6760,6 +6838,8 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_transaction_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); + if (btrfs_free_space_cachep) + kmem_cache_destroy(btrfs_free_space_cachep); } int btrfs_init_cachep(void) @@ -6788,6 +6868,12 @@ int btrfs_init_cachep(void) if (!btrfs_path_cachep) goto fail; + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + sizeof(struct btrfs_free_space), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -6806,6 +6892,26 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +/* + * If a file is moved, it will inherit the cow and compression flags of the new + * directory. 
+ */ +static void fixup_inode_flags(struct inode *dir, struct inode *inode) +{ + struct btrfs_inode *b_dir = BTRFS_I(dir); + struct btrfs_inode *b_inode = BTRFS_I(inode); + + if (b_dir->flags & BTRFS_INODE_NODATACOW) + b_inode->flags |= BTRFS_INODE_NODATACOW; + else + b_inode->flags &= ~BTRFS_INODE_NODATACOW; + + if (b_dir->flags & BTRFS_INODE_COMPRESS) + b_inode->flags |= BTRFS_INODE_COMPRESS; + else + b_inode->flags &= ~BTRFS_INODE_COMPRESS; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -6908,11 +7014,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.name, old_dentry->d_name.len); } else { - btrfs_inc_nlink(old_dentry->d_inode); - ret = btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); } BUG_ON(ret); @@ -6939,6 +7046,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + fixup_inode_flags(new_dir, old_inode); + ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); @@ -7355,7 +7464,6 @@ static const struct address_space_operations btrfs_symlink_aops = { }; static const struct inode_operations btrfs_file_inode_operations = { - .truncate = btrfs_truncate, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d1bace3df9b..7c07fe26b7c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -40,6 +40,7 @@ #include <linux/xattr.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -138,6 +139,24 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg) return 0; } +static int check_flags(unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | \ + FS_NOCOW_FL | FS_COW_FL)) + return -EOPNOTSUPP; + + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL)) + return -EINVAL; + + return 0; +} + static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file->f_path.dentry->d_inode; @@ -153,10 +172,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DIRSYNC_FL)) - return -EOPNOTSUPP; + ret = check_flags(flags); + if (ret) + return ret; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -201,6 +219,22 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) else ip->flags &= ~BTRFS_INODE_DIRSYNC; + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. 
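
User space drives the new per-inode compression and COW bits through the standard flags ioctl that check_flags() above now validates. A minimal sketch of a caller, assuming a kernel whose <linux/fs.h> carries the FS_COMPR_FL (and, with this series, FS_NOCOW_FL) definitions:

#include <fcntl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS/SETFLAGS, FS_COMPR_FL */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Turn on per-file compression, the same thing chattr +c does. */
int main(int argc, char **argv)
{
	int fd, flags;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		close(fd);
		return 1;
	}
	flags |= FS_COMPR_FL;	/* mapped to BTRFS_INODE_COMPRESS above */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");
	close(fd);
	return 0;
}

FS_NOCOMP_FL and FS_COW_FL work the same way in the opposite direction, clearing the corresponding BTRFS_INODE_* bits.
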
+ */ + if (flags & FS_NOCOMP_FL) { + ip->flags &= ~BTRFS_INODE_COMPRESS; + ip->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & FS_COMPR_FL) { + ip->flags |= BTRFS_INODE_COMPRESS; + ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + } + if (flags & FS_NOCOW_FL) + ip->flags |= BTRFS_INODE_NODATACOW; + else if (flags & FS_COW_FL) + ip->flags &= ~BTRFS_INODE_NODATACOW; trans = btrfs_join_transaction(root, 1); BUG_ON(IS_ERR(trans)); @@ -213,9 +247,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_end_transaction(trans, root); mnt_drop_write(file->f_path.mnt); + + ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int btrfs_ioctl_getversion(struct file *file, int __user *arg) @@ -225,6 +261,49 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) return put_user(inode->i_generation, arg); } +static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +{ + struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device *device; + struct request_queue *q; + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + q = bdev_get_queue(device->bdev); + if (blk_queue_discard(q)) { + num_devices++; + minlen = min((u64)q->limits.discard_granularity, + minlen); + } + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (!num_devices) + return -EOPNOTSUPP; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(root, &range); + if (ret < 0) + return ret; + + if (copy_to_user(arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, @@ -409,7 +488,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (ret) goto fail; - btrfs_orphan_cleanup(pending_snapshot->snap); + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; parent = dget_parent(dentry); inode = btrfs_lookup_dentry(parent->d_inode, dentry); @@ -2348,12 +2429,15 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; struct btrfs_trans_handle *trans; u64 transid; + int ret; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); transid = trans->transid; - btrfs_commit_transaction_async(trans, root, 0); + ret = btrfs_commit_transaction_async(trans, root, 0); + if (ret) + return ret; if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) @@ -2388,6 +2472,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); + case FITRIM: + return btrfs_ioctl_fitrim(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 083a5547737..a1c94042530 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -202,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->list); 
INIT_LIST_HEAD(&entry->root_extent_list); + trace_btrfs_ordered_extent_add(inode, entry); + spin_lock(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); @@ -387,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; + trace_btrfs_ordered_extent_put(entry->inode, entry); + if (atomic_dec_and_test(&entry->refs)) { while (!list_empty(&entry->list)) { cur = entry->list.next; @@ -420,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); + trace_btrfs_ordered_extent_remove(inode, entry); + /* * we have no more ordered extents for this inode and * no dirty pages. We can safely remove it from the @@ -585,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode, u64 start = entry->file_offset; u64 end = start + entry->len - 1; + trace_btrfs_ordered_extent_start(inode, entry); + /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ade5802ae..58250e09eb0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1724,6 +1724,7 @@ again: eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); + BUG_ON(!eb); btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, @@ -2513,6 +2514,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = btrfs_level_size(root, node->level); generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, blocksize, generation); + if (!eb) { + err = -EIO; + goto next; + } btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); @@ -2670,6 +2675,7 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.objectid, block->key.offset); + BUG_ON(!eb); WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); @@ -4209,7 +4215,7 @@ out: if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); else - btrfs_orphan_cleanup(fs_root); + err = btrfs_orphan_cleanup(fs_root); } return err; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6a1086e83ff..29b2d7c930e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -88,7 +88,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, search_key.offset = (u64)-1; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto out; @@ -332,7 +333,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d39a9895d93..2edfc039f09 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,6 +52,9 @@ #include "export.h" #include "compression.h" +#define CREATE_TRACE_POINTS +#include <trace/events/btrfs.h> + static const struct super_operations btrfs_super_ops; static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, @@ -620,6 +623,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_root *root = btrfs_sb(sb); int ret; + trace_btrfs_sync_fs(wait); + if (!wait) { 
filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3d73c8d93bb..ce48eb59d61 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -57,7 +57,8 @@ static noinline int join_transaction(struct btrfs_root *root) if (!cur_trans) { cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); - BUG_ON(!cur_trans); + if (!cur_trans) + return -ENOMEM; root->fs_info->generation++; cur_trans->num_writers = 1; cur_trans->num_joined = 0; @@ -195,7 +196,11 @@ again: wait_current_trans(root); ret = join_transaction(root); - BUG_ON(ret); + if (ret < 0) { + if (type != TRANS_JOIN_NOLOCK) + mutex_unlock(&root->fs_info->trans_mutex); + return ERR_PTR(ret); + } cur_trans = root->fs_info->running_transaction; cur_trans->use_count++; @@ -1156,7 +1161,8 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans; ac = kmalloc(sizeof(*ac), GFP_NOFS); - BUG_ON(!ac); + if (!ac) + return -ENOMEM; INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; @@ -1389,6 +1395,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + trace_btrfs_transaction_commit(root); + mutex_unlock(&root->fs_info->trans_mutex); if (current->journal_info == trans) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a4bbb854dfd..c50271ad315 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -799,12 +799,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *dir; int ret; struct btrfs_inode_ref *ref; - struct btrfs_dir_item *di; struct inode *inode; char *name; int namelen; unsigned long ref_ptr; unsigned long ref_end; + int search_done = 0; /* * it is possible that we didn't log all the parent directories @@ -845,7 +845,10 @@ again: * existing back reference, and we don't want to create * dangling pointers in the directory. */ -conflict_again: + + if (search_done) + goto insert; + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret == 0) { char *victim_name; @@ -886,37 +889,21 @@ conflict_again: ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); - kfree(victim_name); - btrfs_release_path(root, path); - goto conflict_again; } kfree(victim_name); ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } BUG_ON(ret); - } - btrfs_release_path(root, path); - - /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - btrfs_inode_ref_index(eb, ref), - name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); - } - btrfs_release_path(root, path); - - /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, - name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + /* + * NOTE: we have searched the root tree and checked the + * corresponding ref, so it does not need to be checked again.
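A pattern worth noting in the transaction.c hunks above, and one this series applies across btrfs: allocation failures that used to crash the machine via BUG_ON() now propagate -ENOMEM (or ERR_PTR(-ENOMEM)) so the caller can drop its locks and bail out. A minimal user-space model of the before/after shapes; the names are invented for illustration:

	#include <errno.h>
	#include <stdlib.h>

	struct trans { int num_writers; };

	/* Before: die on allocation failure (user-space analogue of BUG_ON). */
	static struct trans *join_transaction_old(void)
	{
		struct trans *t = malloc(sizeof(*t));

		if (!t)
			abort();		/* BUG_ON(!cur_trans) */
		t->num_writers = 1;
		return t;
	}

	/* After: report -ENOMEM and let the caller unwind (unlock, return). */
	static int join_transaction_new(struct trans **out)
	{
		struct trans *t = malloc(sizeof(*t));

		if (!t)
			return -ENOMEM;
		t->num_writers = 1;
		*out = t;
		return 0;
	}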
+ */ + search_done = 1; } btrfs_release_path(root, path); +insert: /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, btrfs_inode_ref_index(eb, ref)); @@ -1286,6 +1273,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) + return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); BUG_ON(ret); @@ -1412,6 +1401,11 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) { + ret = -EIO; + goto out; + } + name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); if (!name) { @@ -1821,7 +1815,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, int orig_level; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; level = btrfs_header_level(log->node); orig_level = level; @@ -3107,9 +3102,11 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) .stage = 0, }; - fs_info->log_root_recovering = 1; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + + fs_info->log_root_recovering = 1; trans = btrfs_start_transaction(fs_info->tree_root, 0); BUG_ON(IS_ERR(trans)); @@ -3117,7 +3114,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) wc.trans = trans; wc.pin = 1; - walk_log_tree(trans, log_root_tree, &wc); + ret = walk_log_tree(trans, log_root_tree, &wc); + BUG_ON(ret); again: key.objectid = BTRFS_TREE_LOG_OBJECTID; @@ -3141,8 +3139,7 @@ again: log = btrfs_read_fs_root_no_radix(log_root_tree, &found_key); - BUG_ON(!log); - + BUG_ON(IS_ERR(log)); tmp_key.objectid = found_key.offset; tmp_key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d554e8e658..309a57b9fc8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -33,17 +33,6 @@ #include "volumes.h" #include "async-thread.h" -struct map_lookup { - u64 type; - int io_align; - int io_width; - int stripe_len; - int sector_size; - int num_stripes; - int sub_stripes; - struct btrfs_bio_stripe stripes[]; -}; - static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -1879,6 +1868,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, BUG_ON(ret); + trace_btrfs_chunk_free(root, map, chunk_offset, em->len); + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); BUG_ON(ret); @@ -2606,6 +2597,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, *num_bytes = chunk_bytes_by_type(type, calc_size, map->num_stripes, sub_stripes); + trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes); + em = alloc_extent_map(GFP_NOFS); if (!em) { ret = -ENOMEM; @@ -2714,6 +2707,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, item_size); BUG_ON(ret); } + kfree(chunk); return 0; } @@ -2918,7 +2912,10 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; + u64 stripe_end_offset; u64 stripe_nr; + u64 stripe_nr_orig; + u64 stripe_nr_end; int stripes_allocated = 8; int stripes_required = 1; int stripe_index; @@ -2927,7 +2924,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int max_errors = 0; struct btrfs_multi_bio *multi = NULL; - if 
(multi_ret && !(rw & REQ_WRITE)) + if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: if (multi_ret) { @@ -2968,7 +2965,15 @@ again: max_errors = 1; } } - if (multi_ret && (rw & REQ_WRITE) && + if (rw & REQ_DISCARD) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) { + stripes_required = map->num_stripes; + } + } + if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2988,12 +2993,15 @@ again: /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { + if (rw & REQ_DISCARD) + *length = min_t(u64, em->len - offset, *length); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); + map->stripe_len - stripe_offset); } else { *length = em->len - offset; } @@ -3003,8 +3011,19 @@ again: num_stripes = 1; stripe_index = 0; - if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & REQ_WRITE) + stripe_nr_orig = stripe_nr; + stripe_nr_end = (offset + *length + map->stripe_len - 1) & + (~(map->stripe_len - 1)); + do_div(stripe_nr_end, map->stripe_len); + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + *length); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->num_stripes, + stripe_nr_end - stripe_nr_orig); + stripe_index = do_div(stripe_nr, map->num_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (rw & (REQ_WRITE | REQ_DISCARD)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -3015,7 +3034,7 @@ again: } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & REQ_WRITE) + if (rw & (REQ_WRITE | REQ_DISCARD)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -3028,6 +3047,10 @@ again: if (rw & REQ_WRITE) num_stripes = map->sub_stripes; + else if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->sub_stripes * + (stripe_nr_end - stripe_nr_orig), + map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { @@ -3045,12 +3068,101 @@ again: } BUG_ON(stripe_index >= map->num_stripes); - for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; - stripe_index++; + if (rw & REQ_DISCARD) { + for (i = 0; i < num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + u64 stripes; + u32 last_stripe = 0; + int j; + + div_u64_rem(stripe_nr_end - 1, + map->num_stripes, + &last_stripe); + + for (j = 0; j < map->num_stripes; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + map->num_stripes, &test); + if (test == stripe_index) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, map->num_stripes); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i == 0) { + 
multi->stripes[i].length -= + stripe_offset; + stripe_offset = 0; + } + if (stripe_index == last_stripe) + multi->stripes[i].length -= + stripe_end_offset; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u64 stripes; + int j; + int factor = map->num_stripes / + map->sub_stripes; + u32 last_stripe = 0; + + div_u64_rem(stripe_nr_end - 1, + factor, &last_stripe); + last_stripe *= map->sub_stripes; + + for (j = 0; j < factor; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + factor, &test); + + if (test == + stripe_index / map->sub_stripes) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, factor); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i < map->sub_stripes) { + multi->stripes[i].length -= + stripe_offset; + if (i == map->sub_stripes - 1) + stripe_offset = 0; + } + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + map->sub_stripes - 1)) { + multi->stripes[i].length -= + stripe_end_offset; + } + } else + multi->stripes[i].length = *length; + + stripe_index++; + if (stripe_index == map->num_stripes) { + /* This could only happen for RAID0/10 */ + stripe_index = 0; + stripe_nr++; + } + } + } else { + for (i = 0; i < num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + multi->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; + } } if (multi_ret) { *multi_ret = multi; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7fb59d45fe8..cc2eadaf7a2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -126,6 +126,7 @@ struct btrfs_fs_devices { struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; + u64 length; /* only used for discard mappings */ }; struct btrfs_multi_bio { @@ -145,6 +146,17 @@ struct btrfs_device_info { u64 max_avail; }; +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + /* Used to sort the devices by max_avail(descending sort) */ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index d779cefcfd7..a5303b871b1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -242,6 +242,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) break; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) + continue; name_len = btrfs_dir_name_len(leaf, di); total_size += name_len + 1; diff --git a/fs/buffer.c b/fs/buffer.c index 2e6b1a387b7..a08bb8e61c6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1138,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and the global inode_lock. + * mapping->tree_lock and mapping->host->i_lock. 
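The discard mapping above trims the first and last stripes of the range with stripe_offset and stripe_end_offset instead of visiting every stripe unit. As a sanity check, its per-device totals for RAID0 match this brute-force user-space model (the function and its parameters are illustrative, not kernel API):

	#include <stdint.h>

	/* Walk each stripe unit of the discard range [offset, offset + length)
	 * and accumulate per-device byte counts; the kernel arrives at the
	 * same totals in closed form. */
	static void map_discard_raid0(uint64_t offset, uint64_t length,
				      uint64_t stripe_len, int num_stripes,
				      uint64_t *dev_len /* [num_stripes] */)
	{
		uint64_t end = offset + length;
		int i;

		for (i = 0; i < num_stripes; i++)
			dev_len[i] = 0;
		while (offset < end) {
			uint64_t stripe_nr = offset / stripe_len;
			uint64_t chunk = (stripe_nr + 1) * stripe_len - offset;

			if (chunk > end - offset)
				chunk = end - offset;	/* partial final unit */
			dev_len[stripe_nr % num_stripes] += chunk;
			offset += chunk;
		}
	}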
*/ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 561438b6a50..37368ba2e67 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -92,7 +92,7 @@ static int ceph_set_page_dirty(struct page *page) ci->i_head_snapc = ceph_get_snap_context(snapc); ++ci->i_wrbuffer_ref_head; if (ci->i_wrbuffer_ref == 0) - igrab(inode); + ihold(inode); ++ci->i_wrbuffer_ref; dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " "snapc %p seq %lld (%d snaps)\n", diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f40b9139e43..0aee66b92af 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -463,8 +463,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, capsnap, snapc); - igrab(inode); - + ihold(inode); + atomic_set(&capsnap->nref, 1); capsnap->ci = ci; INIT_LIST_HEAD(&capsnap->ci_item); diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 06d27a41807..af56ad56a89 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -61,4 +61,13 @@ void coda_sysctl_clean(void) fs_table_header = NULL; } } + +#else +void coda_sysctl_init(void) +{ +} + +void coda_sysctl_clean(void) +{ +} #endif diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 816f88e6b9c..98b77c89494 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -8,6 +8,7 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> +#include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; @@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - if (inode->i_mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(toput_inode); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index bfd8b680e64..d2a70a4561f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -266,7 +266,6 @@ void ecryptfs_destroy_mount_crypt_stat( &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { list_del(&auth_tok->mount_crypt_stat_list); - mount_crypt_stat->num_global_auth_toks--; if (auth_tok->global_auth_tok_key && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) key_put(auth_tok->global_auth_tok_key); @@ -1389,6 +1388,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = -ENOMEM; goto out; } + /* Zeroed page ensures the in-header unencrypted i_size is set to 0 */ rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, ecryptfs_dentry); if (unlikely(rc)) { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index e00753496e3..bd3cafd0949 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -233,7 +233,7 @@ ecryptfs_get_key_payload_data(struct key *key) struct ecryptfs_key_sig { struct list_head crypt_stat_list; - char keysig[ECRYPTFS_SIG_SIZE_HEX]; + char 
keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; struct ecryptfs_filename { @@ -257,19 +257,18 @@ struct ecryptfs_filename { struct ecryptfs_crypt_stat { #define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 #define ECRYPTFS_POLICY_APPLIED 0x00000002 -#define ECRYPTFS_NEW_FILE 0x00000004 -#define ECRYPTFS_ENCRYPTED 0x00000008 -#define ECRYPTFS_SECURITY_WARNING 0x00000010 -#define ECRYPTFS_ENABLE_HMAC 0x00000020 -#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 -#define ECRYPTFS_KEY_VALID 0x00000080 -#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 -#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 -#define ECRYPTFS_KEY_SET 0x00000400 -#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 -#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 -#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 -#define ECRYPTFS_UNLINK_SIGS 0x00004000 +#define ECRYPTFS_ENCRYPTED 0x00000004 +#define ECRYPTFS_SECURITY_WARNING 0x00000008 +#define ECRYPTFS_ENABLE_HMAC 0x00000010 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000020 +#define ECRYPTFS_KEY_VALID 0x00000040 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000080 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000100 +#define ECRYPTFS_KEY_SET 0x00000200 +#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000400 +#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800 +#define ECRYPTFS_ENCFN_USE_FEK 0x00001000 +#define ECRYPTFS_UNLINK_SIGS 0x00002000 u32 flags; unsigned int file_version; size_t iv_bytes; @@ -297,7 +296,6 @@ struct ecryptfs_inode_info { struct inode vfs_inode; struct inode *wii_inode; struct file *lower_file; - struct mutex lower_file_mutex; struct ecryptfs_crypt_stat crypt_stat; }; @@ -333,7 +331,6 @@ struct ecryptfs_global_auth_tok { u32 flags; struct list_head mount_crypt_stat_list; struct key *global_auth_tok_key; - struct ecryptfs_auth_tok *global_auth_tok; unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; @@ -380,7 +377,6 @@ struct ecryptfs_mount_crypt_stat { u32 flags; struct list_head global_auth_tok_list; struct mutex global_auth_tok_list_mutex; - size_t num_global_auth_toks; size_t global_default_cipher_key_size; size_t global_default_fn_cipher_key_bytes; unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 7d1050e254f..cedc913d11b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -273,7 +273,14 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, int datasync) { - return vfs_fsync(ecryptfs_file_to_lower(file), datasync); + int rc = 0; + + rc = generic_file_fsync(file, datasync); + if (rc) + goto out; + rc = vfs_fsync(ecryptfs_file_to_lower(file), datasync); +out: + return rc; } static int ecryptfs_fasync(int fd, struct file *file, int flag) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b592938a84b..f99051b7ada 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -143,26 +143,6 @@ out: } /** - * grow_file - * @ecryptfs_dentry: the eCryptfs dentry - * - * This is the code which will grow the file to its correct size. 
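The rewritten ecryptfs_fsync() above layers two syncs: flush the eCryptfs inode's own dirty state first, then push the lower file's data to disk, returning whichever error hits first. The generic shape, as a kernel-style sketch (file_to_lower() is a hypothetical accessor standing in for ecryptfs_file_to_lower()):

	static int stacked_fsync(struct file *file, int datasync)
	{
		int rc;

		/* Sync the upper inode's pages and metadata first ... */
		rc = generic_file_fsync(file, datasync);
		if (rc)
			return rc;
		/* ... then have the lower filesystem persist the backing file. */
		return vfs_fsync(file_to_lower(file), datasync);
	}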
- */ -static int grow_file(struct dentry *ecryptfs_dentry) -{ - struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; - char zero_virt[] = { 0x00 }; - int rc = 0; - - rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1); - i_size_write(ecryptfs_inode, 0); - rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); - ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= - ECRYPTFS_NEW_FILE; - return rc; -} - -/** * ecryptfs_initialize_file * * Cause the file to be changed from a basic empty file to an ecryptfs @@ -181,7 +161,6 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); goto out; } - crypt_stat->flags |= ECRYPTFS_NEW_FILE; ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); rc = ecryptfs_new_file_context(ecryptfs_dentry); if (rc) { @@ -202,9 +181,6 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); goto out; } - rc = grow_file(ecryptfs_dentry); - if (rc) - printk(KERN_ERR "Error growing file; rc = [%d]\n", rc); out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index c1436cff6f2..03e609c4501 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -65,6 +65,24 @@ static int process_request_key_err(long err_code) return rc; } +static int process_find_global_auth_tok_for_sig_err(int err_code) +{ + int rc = err_code; + + switch (err_code) { + case -ENOENT: + ecryptfs_printk(KERN_WARNING, "Missing auth tok\n"); + break; + case -EINVAL: + ecryptfs_printk(KERN_WARNING, "Invalid auth tok\n"); + break; + default: + rc = process_request_key_err(err_code); + break; + } + return rc; +} + /** * ecryptfs_parse_packet_length * @data: Pointer to memory containing length at offset @@ -403,27 +421,120 @@ out: return rc; } +/** + * ecryptfs_verify_version + * @version: The version number to confirm + * + * Returns zero on good version; non-zero otherwise + */ +static int ecryptfs_verify_version(u16 version) +{ + int rc = 0; + unsigned char major; + unsigned char minor; + + major = ((version >> 8) & 0xFF); + minor = (version & 0xFF); + if (major != ECRYPTFS_VERSION_MAJOR) { + ecryptfs_printk(KERN_ERR, "Major version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MAJOR, major); + rc = -EINVAL; + goto out; + } + if (minor != ECRYPTFS_VERSION_MINOR) { + ecryptfs_printk(KERN_ERR, "Minor version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MINOR, minor); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_verify_auth_tok_from_key + * @auth_tok_key: key containing the authentication token + * @auth_tok: authentication token + * + * Returns zero on valid auth tok; -EINVAL otherwise + */ +static int +ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok **auth_tok) +{ + int rc = 0; + + (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key); + if (ecryptfs_verify_version((*auth_tok)->version)) { + printk(KERN_ERR "Data structure version mismatch. 
Userspace " + "tools must match eCryptfs kernel module with major " + "version [%d] and minor version [%d]\n", + ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR); + rc = -EINVAL; + goto out; + } + if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD + && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { + printk(KERN_ERR "Invalid auth_tok structure " + "returned from key query\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + static int ecryptfs_find_global_auth_tok_for_sig( - struct ecryptfs_global_auth_tok **global_auth_tok, + struct key **auth_tok_key, + struct ecryptfs_auth_tok **auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) { struct ecryptfs_global_auth_tok *walker; int rc = 0; - (*global_auth_tok) = NULL; + (*auth_tok_key) = NULL; + (*auth_tok) = NULL; mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); list_for_each_entry(walker, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { - if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { - rc = key_validate(walker->global_auth_tok_key); - if (!rc) - (*global_auth_tok) = walker; + if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX)) + continue; + + if (walker->flags & ECRYPTFS_AUTH_TOK_INVALID) { + rc = -EINVAL; goto out; } + + rc = key_validate(walker->global_auth_tok_key); + if (rc) { + if (rc == -EKEYEXPIRED) + goto out; + goto out_invalid_auth_tok; + } + + down_write(&(walker->global_auth_tok_key->sem)); + rc = ecryptfs_verify_auth_tok_from_key( + walker->global_auth_tok_key, auth_tok); + if (rc) + goto out_invalid_auth_tok_unlock; + + (*auth_tok_key) = walker->global_auth_tok_key; + key_get(*auth_tok_key); + goto out; } - rc = -EINVAL; + rc = -ENOENT; + goto out; +out_invalid_auth_tok_unlock: + up_write(&(walker->global_auth_tok_key->sem)); +out_invalid_auth_tok: + printk(KERN_WARNING "Invalidating auth tok with sig = [%s]\n", sig); + walker->flags |= ECRYPTFS_AUTH_TOK_INVALID; + key_put(walker->global_auth_tok_key); + walker->global_auth_tok_key = NULL; out: mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); return rc; @@ -451,14 +562,11 @@ ecryptfs_find_auth_tok_for_sig( struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) { - struct ecryptfs_global_auth_tok *global_auth_tok; int rc = 0; - (*auth_tok_key) = NULL; - (*auth_tok) = NULL; - if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, - mount_crypt_stat, sig)) { - + rc = ecryptfs_find_global_auth_tok_for_sig(auth_tok_key, auth_tok, + mount_crypt_stat, sig); + if (rc == -ENOENT) { /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the * mount_crypt_stat structure, we prevent to use auth toks that * are not inserted through the ecryptfs_add_global_auth_tok @@ -470,8 +578,7 @@ ecryptfs_find_auth_tok_for_sig( rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, sig); - } else - (*auth_tok) = global_auth_tok->global_auth_tok; + } return rc; } @@ -531,6 +638,16 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, } s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; (*packet_size) = 0; + rc = ecryptfs_find_auth_tok_for_sig( + &auth_tok_key, + &s->auth_tok, mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, + mount_crypt_stat->global_default_fnek_sig, rc); + goto out; + } rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( &s->desc.tfm, &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); @@ -616,16 +733,6 @@ 
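Every path in the keystore changes above that hands back an auth_tok_key now returns it with the key's semaphore write-held (down_write) and a reference taken (key_get), and every exit path repeats the same two-step release. A hypothetical helper, not in the patch, that captures the obligation; up_write() and key_put() are the real kernel primitives being paired:

	static inline void ecryptfs_put_auth_tok_key(struct key *auth_tok_key)
	{
		if (!auth_tok_key)
			return;
		/* Release in reverse order of acquisition: unlock the key's
		 * payload semaphore, then drop the reference from key_get(). */
		up_write(&auth_tok_key->sem);
		key_put(auth_tok_key);
	}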
ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, goto out_free_unlock; } dest[s->i++] = s->cipher_code; - rc = ecryptfs_find_auth_tok_for_sig( - &auth_tok_key, - &s->auth_tok, mount_crypt_stat, - mount_crypt_stat->global_default_fnek_sig); - if (rc) { - printk(KERN_ERR "%s: Error attempting to find auth tok for " - "fnek sig [%s]; rc = [%d]\n", __func__, - mount_crypt_stat->global_default_fnek_sig, rc); - goto out_free_unlock; - } /* TODO: Support other key modules than passphrase for * filename encryption */ if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { @@ -765,8 +872,10 @@ out_free_unlock: out_unlock: mutex_unlock(s->tfm_mutex); out: - if (auth_tok_key) + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); key_put(auth_tok_key); + } kfree(s); return rc; } @@ -879,6 +988,15 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, __func__, s->cipher_code); goto out; } + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &s->auth_tok, mount_crypt_stat, + s->fnek_sig_hex); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, + rc); + goto out; + } rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, &s->tfm_mutex, s->cipher_string); @@ -925,15 +1043,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, * >= ECRYPTFS_MAX_IV_BYTES. */ memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); s->desc.info = s->iv; - rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, - &s->auth_tok, mount_crypt_stat, - s->fnek_sig_hex); - if (rc) { - printk(KERN_ERR "%s: Error attempting to find auth tok for " - "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, - rc); - goto out_free_unlock; - } /* TODO: Support other key modules than passphrase for * filename encryption */ if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { @@ -1002,8 +1111,10 @@ out: (*filename_size) = 0; (*filename) = NULL; } - if (auth_tok_key) + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); key_put(auth_tok_key); + } kfree(s); return rc; } @@ -1520,38 +1631,6 @@ out: return rc; } -/** - * ecryptfs_verify_version - * @version: The version number to confirm - * - * Returns zero on good version; non-zero otherwise - */ -static int ecryptfs_verify_version(u16 version) -{ - int rc = 0; - unsigned char major; - unsigned char minor; - - major = ((version >> 8) & 0xFF); - minor = (version & 0xFF); - if (major != ECRYPTFS_VERSION_MAJOR) { - ecryptfs_printk(KERN_ERR, "Major version number mismatch. " - "Expected [%d]; got [%d]\n", - ECRYPTFS_VERSION_MAJOR, major); - rc = -EINVAL; - goto out; - } - if (minor != ECRYPTFS_VERSION_MINOR) { - ecryptfs_printk(KERN_ERR, "Minor version number mismatch. " - "Expected [%d]; got [%d]\n", - ECRYPTFS_VERSION_MINOR, minor); - rc = -EINVAL; - goto out; - } -out: - return rc; -} - int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, struct ecryptfs_auth_tok **auth_tok, char *sig) @@ -1563,31 +1642,16 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, printk(KERN_ERR "Could not find key with description: [%s]\n", sig); rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; goto out; } - (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); - if (ecryptfs_verify_version((*auth_tok)->version)) { - printk(KERN_ERR - "Data structure version mismatch. 
" - "Userspace tools must match eCryptfs " - "kernel module with major version [%d] " - "and minor version [%d]\n", - ECRYPTFS_VERSION_MAJOR, - ECRYPTFS_VERSION_MINOR); - rc = -EINVAL; - goto out_release_key; - } - if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD - && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { - printk(KERN_ERR "Invalid auth_tok structure " - "returned from key query\n"); - rc = -EINVAL; - goto out_release_key; - } -out_release_key: + down_write(&(*auth_tok_key)->sem); + rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); if (rc) { + up_write(&(*auth_tok_key)->sem); key_put(*auth_tok_key); (*auth_tok_key) = NULL; + goto out; } out: return rc; @@ -1809,6 +1873,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, find_next_matching_auth_tok: found_auth_tok = 0; if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); key_put(auth_tok_key); auth_tok_key = NULL; } @@ -1895,8 +1960,10 @@ found_matching_auth_tok: out_wipe_list: wipe_auth_tok_list(&auth_tok_list); out: - if (auth_tok_key) + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); key_put(auth_tok_key); + } return rc; } @@ -2324,7 +2391,7 @@ ecryptfs_generate_key_packet_set(char *dest_base, size_t max) { struct ecryptfs_auth_tok *auth_tok; - struct ecryptfs_global_auth_tok *global_auth_tok; + struct key *auth_tok_key = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; @@ -2343,21 +2410,16 @@ ecryptfs_generate_key_packet_set(char *dest_base, list_for_each_entry(key_sig, &crypt_stat->keysig_list, crypt_stat_list) { memset(key_rec, 0, sizeof(*key_rec)); - rc = ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, + rc = ecryptfs_find_global_auth_tok_for_sig(&auth_tok_key, + &auth_tok, mount_crypt_stat, key_sig->keysig); if (rc) { - printk(KERN_ERR "Error attempting to get the global " - "auth_tok; rc = [%d]\n", rc); + printk(KERN_WARNING "Unable to retrieve auth tok with " + "sig = [%s]\n", key_sig->keysig); + rc = process_find_global_auth_tok_for_sig_err(rc); goto out_free; } - if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID) { - printk(KERN_WARNING - "Skipping invalid auth tok with sig = [%s]\n", - global_auth_tok->sig); - continue; - } - auth_tok = global_auth_tok->global_auth_tok; if (auth_tok->token_type == ECRYPTFS_PASSWORD) { rc = write_tag_3_packet((dest_base + (*len)), &max, auth_tok, @@ -2395,6 +2457,9 @@ ecryptfs_generate_key_packet_set(char *dest_base, rc = -EINVAL; goto out_free; } + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + auth_tok_key = NULL; } if (likely(max > 0)) { dest_base[(*len)] = 0x00; @@ -2407,6 +2472,11 @@ out_free: out: if (rc) (*len) = 0; + if (auth_tok_key) { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + } + mutex_unlock(&crypt_stat->keysig_list_mutex); return rc; } @@ -2424,6 +2494,7 @@ int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) return -ENOMEM; } memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; /* Caller must hold keysig_list_mutex */ list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); @@ -2453,7 +2524,6 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); list_add(&new_auth_tok->mount_crypt_stat_list, &mount_crypt_stat->global_auth_tok_list); - mount_crypt_stat->num_global_auth_toks++; 
mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); out: return rc; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 758323a0f09..c27c0ecf90b 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -122,7 +122,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); int rc = 0; - mutex_lock(&inode_info->lower_file_mutex); if (!inode_info->lower_file) { struct dentry *lower_dentry; struct vfsmount *lower_mnt = @@ -138,7 +137,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) inode_info->lower_file = NULL; } } - mutex_unlock(&inode_info->lower_file_mutex); return rc; } @@ -241,14 +239,14 @@ static int ecryptfs_init_global_auth_toks( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { struct ecryptfs_global_auth_tok *global_auth_tok; + struct ecryptfs_auth_tok *auth_tok; int rc = 0; list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { rc = ecryptfs_keyring_auth_tok_for_sig( - &global_auth_tok->global_auth_tok_key, - &global_auth_tok->global_auth_tok, + &global_auth_tok->global_auth_tok_key, &auth_tok, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Could not find valid key in user " @@ -256,8 +254,10 @@ static int ecryptfs_init_global_auth_toks( "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; goto out; - } else + } else { global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; + up_write(&(global_auth_tok->global_auth_tok_key)->sem); + } } out: return rc; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index cc64fca89f8..6a44148c5fb 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -62,6 +62,18 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) { int rc; + /* + * Refuse to write the page out if we are called from reclaim context + * since our writepage() path may potentially allocate memory when + * calling into the lower fs vfs_write() which may in turn invoke + * us again. 
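The rule stated above generalizes to any stacked filesystem whose ->writepage may allocate memory on its way into the lower filesystem; condensed into a kernel-style sketch (stacked_encrypt_and_write() is hypothetical and is assumed to unlock the page itself):

	static int stacked_writepage(struct page *page,
				     struct writeback_control *wbc)
	{
		/* Called from reclaim: writing out would allocate in the lower
		 * fs and could recurse back into reclaim, so defer the page. */
		if (current->flags & PF_MEMALLOC) {
			redirty_page_for_writepage(wbc, page);
			unlock_page(page);
			return 0;
		}
		return stacked_encrypt_and_write(page);
	}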
+ */ + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + rc = 0; + goto out; + } + rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting " @@ -70,8 +82,8 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) goto out; } SetPageUptodate(page); - unlock_page(page); out: + unlock_page(page); return rc; } @@ -193,11 +205,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page) &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; int rc = 0; - if (!crypt_stat - || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED) - || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { - ecryptfs_printk(KERN_DEBUG, - "Passing through unencrypted page\n"); + if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment(page, page->index, 0, PAGE_CACHE_SIZE, page->mapping->host); @@ -295,8 +303,7 @@ static int ecryptfs_write_begin(struct file *file, struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) - || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment( page, index, 0, PAGE_CACHE_SIZE, mapping->host); if (rc) { @@ -374,6 +381,11 @@ static int ecryptfs_write_begin(struct file *file, && (pos != 0)) zero_user(page, 0, PAGE_CACHE_SIZE); out: + if (unlikely(rc)) { + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + } return rc; } @@ -486,13 +498,8 @@ static int ecryptfs_write_end(struct file *file, struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc; + int need_unlock_page = 1; - if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { - ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in " - "crypt_stat at memory location [%p]\n", crypt_stat); - crypt_stat->flags &= ~(ECRYPTFS_NEW_FILE); - } else - ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { @@ -512,26 +519,26 @@ static int ecryptfs_write_end(struct file *file, "zeros in page with index = [0x%.16lx]\n", index); goto out; } - rc = ecryptfs_encrypt_page(page); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " - "index [0x%.16lx])\n", index); - goto out; - } + set_page_dirty(page); + unlock_page(page); + need_unlock_page = 0; if (pos + copied > i_size_read(ecryptfs_inode)) { i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " "[0x%.16llx]\n", (unsigned long long)i_size_read(ecryptfs_inode)); + balance_dirty_pages_ratelimited(mapping); + rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); + if (rc) { + printk(KERN_ERR "Error writing inode size to metadata; " + "rc = [%d]\n", rc); + goto out; + } } - rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); - if (rc) - printk(KERN_ERR "Error writing inode size to metadata; " - "rc = [%d]\n", rc); - else - rc = copied; + rc = copied; out: - unlock_page(page); + if (need_unlock_page) + unlock_page(page); page_cache_release(page); return rc; } diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index db184ef15d3..85d43096311 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -44,15 +44,11 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, ssize_t 
rc; inode_info = ecryptfs_inode_to_private(ecryptfs_inode); - mutex_lock(&inode_info->lower_file_mutex); BUG_ON(!inode_info->lower_file); - inode_info->lower_file->f_pos = offset; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_write(inode_info->lower_file, data, size, - &inode_info->lower_file->f_pos); + rc = vfs_write(inode_info->lower_file, data, size, &offset); set_fs(fs_save); - mutex_unlock(&inode_info->lower_file_mutex); mark_inode_dirty_sync(ecryptfs_inode); return rc; } @@ -234,15 +230,11 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, mm_segment_t fs_save; ssize_t rc; - mutex_lock(&inode_info->lower_file_mutex); BUG_ON(!inode_info->lower_file); - inode_info->lower_file->f_pos = offset; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_read(inode_info->lower_file, data, size, - &inode_info->lower_file->f_pos); + rc = vfs_read(inode_info->lower_file, data, size, &offset); set_fs(fs_save); - mutex_unlock(&inode_info->lower_file_mutex); return rc; } diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 3042fe123a3..bacc882e1ae 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -55,7 +55,6 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) if (unlikely(!inode_info)) goto out; ecryptfs_init_crypt_stat(&inode_info->crypt_stat); - mutex_init(&inode_info->lower_file_mutex); inode_info->lower_file = NULL; inode = &inode_info->vfs_inode; out: @@ -198,7 +197,7 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) const struct super_operations ecryptfs_sops = { .alloc_inode = ecryptfs_alloc_inode, .destroy_inode = ecryptfs_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = generic_drop_inode, .statfs = ecryptfs_statfs, .remount_fs = NULL, .evict_inode = ecryptfs_evict_inode, diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index adf96b82278..97b970e7dd1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -21,6 +21,8 @@ #include "ext4_jbd2.h" #include "mballoc.h" +#include <trace/events/ext4.h> + /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) * We do it here so the bitmap uptodate bit * get set with buffer lock held. 
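Back in the ecryptfs read_write.c hunks above, handing vfs_read()/vfs_write() a local offset instead of seeking the shared f_pos under lower_file_mutex is exactly the pread/pwrite idiom; a user-space rendering:

	#include <unistd.h>
	#include <sys/types.h>

	/* Positioned I/O on the lower file: no shared file position is
	 * touched, so concurrent readers and writers need no lock. */
	static ssize_t read_lower(int lower_fd, void *buf, size_t size,
				  off_t offset)
	{
		return pread(lower_fd, buf, size, offset);
	}

	static ssize_t write_lower(int lower_fd, const void *buf, size_t size,
				   off_t offset)
	{
		return pwrite(lower_fd, buf, size, offset);
	}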
*/ + trace_ext4_read_block_bitmap_load(sb, block_group); set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index d8b992e658c..e25e99bf7ee 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) return 1; } -static inline void ext4_journal_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_release_buffer(handle, bh); -} - static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) { return ext4_journal_start_sb(inode->i_sb, nblocks); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7516fb9c0bd..dd2cb5076ff 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,6 +44,8 @@ #include "ext4_jbd2.h" #include "ext4_extents.h" +#include <trace/events/ext4.h> + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, if (unlikely(!bh)) goto err; if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, block, + path[ppos].p_block); if (bh_submit_read(bh) < 0) { put_bh(bh); goto err; @@ -1034,7 +1038,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, 0, ablocks[i], 1, + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } @@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); - ext4_free_blocks(handle, inode, 0, leaf, 1, + ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; } @@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, num = le32_to_cpu(ex->ee_block) + ee_len - from; start = ext4_ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); + ext4_free_blocks(handle, inode, NULL, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", @@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, { int i, depth; struct ext4_extent_header *eh; - struct ext4_extent *ex, *last_ex; + struct ext4_extent *last_ex; if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) return 0; depth = ext_depth(inode); eh = path[depth].p_hdr; - ex = path[depth].p_ext; if (unlikely(!eh->eh_entries)) { EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " @@ -3295,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent_header *eh; struct ext4_extent newex, *ex; - ext4_fsblk_t newblock; + ext4_fsblk_t newblock = 0; int err = 0, depth, ret; unsigned int allocated = 0; struct ext4_allocation_request ar; @@ -3305,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { @@ -3352,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t 
*handle, struct inode *inode, err = -EIO; goto out2; } - eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex) { @@ -3485,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex), + ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), ext4_ext_get_actual_len(&newex), 0); goto out2; } @@ -3525,6 +3527,8 @@ out2: ext4_ext_drop_refs(path); kfree(path); } + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); return err ? err : allocated; } @@ -3658,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because @@ -3673,6 +3678,7 @@ ret = inode_newsize_ok(inode, (len + offset)); if (ret) { mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } retry: @@ -3717,6 +3723,8 @@ retry: goto retry; } mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, + ret > 0 ? ret2 : ret); return ret > 0 ? ret2 : ret; } @@ -3775,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } return ret > 0 ? ret2 : ret; } + /* * Callback function called for each extent to gather FIEMAP information. */ @@ -3782,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { - struct fiemap_extent_info *fieinfo = data; - unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; __u64 logical; __u64 physical; __u64 length; + loff_t size; __u32 flags = 0; - int error; + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; if (newex->ec_start == 0) { - pgoff_t offset; - struct page *page; + /* + * No extent in the extent tree contains block @newex->ec_start, + * so the block may lie in 1) a hole or 2) a delayed extent. + * + * Holes and delayed extents are processed as follows. + * 1. look up dirty pages within the specified range in the + * pagecache. If no page is found, there is no delayed extent; + * return EXT_CONTINUE. + * 2. find the first mapped buffer, + * 3. check that the mapped buffer is both in the request range + * and a delayed buffer. If not, there is no delayed extent; + * return. + * 4. a delayed extent is found; the extent will be collected + * (see the sketch below).
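A compact user-space model of steps 1-4 (all names illustrative): treat the pagecache scan as a walk over per-block delayed flags and report the first contiguous delayed run.

	#include <stddef.h>

	/* delayed[i] is nonzero when block i is dirty but unmapped, modelling
	 * a pagecache buffer with buffer_delay() set.  Returns the length of
	 * the run found at or after 'start', or 0 for a plain hole. */
	static size_t first_delayed_run(const unsigned char *delayed,
					size_t nblocks, size_t start,
					size_t *run_start)
	{
		size_t i = start, len = 0;

		while (i < nblocks && !delayed[i])	/* steps 1-3: scan forward */
			i++;
		if (i == nblocks)
			return 0;			/* hole: nothing delayed */
		*run_start = i;				/* step 4: collect extent */
		while (i + len < nblocks && delayed[i + len])
			len++;
		return len;
	}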
+ */ + ext4_lblk_t end = 0; + pgoff_t last_offset; + pgoff_t offset; + pgoff_t index; + struct page **pages = NULL; struct buffer_head *bh = NULL; + struct buffer_head *head = NULL; + unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); + + pages = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; offset = logical >> PAGE_SHIFT; - page = find_get_page(inode->i_mapping, offset); - if (!page || !page_has_buffers(page)) - return EXT_CONTINUE; +repeat: + last_offset = offset; + head = NULL; + ret = find_get_pages_tag(inode->i_mapping, &offset, + PAGECACHE_TAG_DIRTY, nr_pages, pages); + + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* First time, try to find a mapped buffer. */ + if (ret == 0) { +out: + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + /* just a hole. */ + kfree(pages); + return EXT_CONTINUE; + } - bh = page_buffers(page); + /* Try to find the 1st mapped buffer. */ + end = ((__u64)pages[0]->index << PAGE_SHIFT) >> + blksize_bits; + if (!page_has_buffers(pages[0])) + goto out; + head = page_buffers(pages[0]); + if (!head) + goto out; - if (!bh) - return EXT_CONTINUE; + bh = head; + do { + if (buffer_mapped(bh)) { + /* get the 1st mapped buffer. */ + if (end > newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; + goto found_mapped_buffer; + } + bh = bh->b_this_page; + end++; + } while (bh != head); - if (buffer_delay(bh)) { - flags |= FIEMAP_EXTENT_DELALLOC; - page_cache_release(page); + /* No mapped buffer found. */ + goto out; } else { - page_cache_release(page); - return EXT_CONTINUE; + /*Find contiguous delayed buffers. */ + if (ret > 0 && pages[0]->index == last_offset) + head = page_buffers(pages[0]); + bh = head; } + +found_mapped_buffer: + if (bh != NULL && buffer_delay(bh)) { + /* 1st or contiguous delayed buffer found. */ + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* + * 1st delayed buffer found, record + * the start of extent. + */ + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_block = end; + logical = (__u64)end << blksize_bits; + } + /* Find contiguous delayed buffers. */ + do { + if (!buffer_delay(bh)) + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + + for (index = 1; index < ret; index++) { + if (!page_has_buffers(pages[index])) { + bh = NULL; + break; + } + head = page_buffers(pages[index]); + if (!head) { + bh = NULL; + break; + } + if (pages[index]->index != + pages[0]->index + index) { + /* Blocks are not contiguous. */ + bh = NULL; + break; + } + bh = head; + do { + if (!buffer_delay(bh)) + /* Delayed-extent ends. */ + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + } + } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) + /* a hole found. */ + goto out; + +found_delayed_extent: + newex->ec_len = min(end - newex->ec_block, + (ext4_lblk_t)EXT_INIT_MAX_LEN); + if (ret == nr_pages && bh != NULL && + newex->ec_len < EXT_INIT_MAX_LEN && + buffer_delay(bh)) { + /* Have not collected an extent and continue. */ + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + goto repeat; + } + + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + kfree(pages); } physical = (__u64)newex->ec_start << blksize_bits; @@ -3822,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, if (ex && ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; - /* - * If this extent reaches EXT_MAX_BLOCK, it must be last. 
- * - * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, - * this also indicates no more allocated blocks. - * - * XXX this might miss a single-block extent at EXT_MAX_BLOCK - */ - if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || - newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { - loff_t size = i_size_read(inode); - loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); - + size = i_size_read(inode); + if (logical + length >= size) flags |= FIEMAP_EXTENT_LAST; - if ((flags & FIEMAP_EXTENT_DELALLOC) && - logical+length > size) - length = (size - logical + bs - 1) & ~(bs-1); - } - error = fiemap_fill_next_extent(fieinfo, logical, physical, + ret = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); - if (error < 0) - return error; - if (error == 1) + if (ret < 0) + return ret; + if (ret == 1) return EXT_BREAK; - return EXT_CONTINUE; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 7829b287822..7f74019d6d7 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -164,20 +164,20 @@ int ext4_sync_file(struct file *file, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_ext4_sync_file(file, datasync); + trace_ext4_sync_file_enter(file, datasync); if (inode->i_sb->s_flags & MS_RDONLY) return 0; ret = ext4_flush_completed_IO(inode); if (ret < 0) - return ret; + goto out; if (!journal) { ret = generic_file_fsync(file, datasync); if (!ret && !list_empty(&inode->i_dentry)) ext4_sync_parent(inode); - return ret; + goto out; } /* @@ -194,8 +194,10 @@ int ext4_sync_file(struct file *file, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) - return ext4_force_commit(inode->i_sb); + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; if (jbd2_log_start_commit(journal, commit_tid)) { @@ -215,5 +217,7 @@ int ext4_sync_file(struct file *file, int datasync) ret = jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + out: + trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 78b79e1bd7e..21bb2f61e50 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * We do it here so the bitmap uptodate bit * get set with buffer lock held. 
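The fiemap rework above also replaces the EXT_MAX_BLOCK probing with a single size test: an extent is last once it reaches i_size. Reduced to a user-space predicate (FIEMAP_EXTENT_LAST is the real uapi constant from <linux/fiemap.h>):

	#include <stdint.h>
	#include <linux/fiemap.h>	/* FIEMAP_EXTENT_LAST */

	static uint32_t last_extent_flag(uint64_t logical, uint64_t length,
					 uint64_t i_size)
	{
		/* No probing for a next allocated block: anything reaching or
		 * passing i_size must be the final extent. */
		return (logical + length >= i_size) ? FIEMAP_EXTENT_LAST : 0;
	}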
*/ + trace_ext4_load_inode_bitmap(sb, block_group); set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); @@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group + flex_size; if (*group > ngroups) *group = 0; - return find_group_orlov(sb, parent, group, mode, 0); + return find_group_orlov(sb, parent, group, mode, NULL); } /* @@ -1054,6 +1055,11 @@ got: } } + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9297ad46c46..1a86282b902 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); + ret = ext4_journal_restart(handle, nblocks); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -720,7 +720,7 @@ allocated: return ret; failed_out: for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); return ret; } @@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return err; failed: /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. 
*/ - ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, EXT4_FREE_BLOCKS_FORGET); } for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); return err; } @@ -924,7 +924,7 @@ err_out: ext4_free_blocks(handle, inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), blks, 0); return err; @@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, int count = 0; ext4_fsblk_t first_block = 0; + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, @@ -1058,6 +1059,8 @@ cleanup: partial--; } out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); return err; } @@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - int commit_write = 0, redirty_page = 0; + int commit_write = 0, skip_page = 0; struct page *page = pvec.pages[i]; index = page->index; @@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, * If the page does not have buffers (for * whatever reason), try to create them using * __block_write_begin. If this fails, - * redirty the page and move on. + * skip the page and move on. 
*/ if (!page_has_buffers(page)) { if (__block_write_begin(page, 0, len, noalloc_get_block_write)) { - redirty_page: - redirty_page_for_writepage(mpd->wbc, - page); + skip_page: unlock_page(page); continue; } @@ -2104,7 +2105,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, block_start = 0; do { if (!bh) - goto redirty_page; + goto skip_page; if (map && (cur_logical >= map->m_lblk) && (cur_logical <= (map->m_lblk + (map->m_len - 1)))) { @@ -2120,22 +2121,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_unwritten(bh); } - /* redirty page if block allocation undone */ + /* skip page if block allocation undone */ if (buffer_delay(bh) || buffer_unwritten(bh)) - redirty_page = 1; + skip_page = 1; bh = bh->b_this_page; block_start += bh->b_size; cur_logical++; pblock++; } while (bh != page_bufs); - if (redirty_page) - goto redirty_page; + if (skip_page) + goto skip_page; if (commit_write) /* mark the buffer_heads as dirty & uptodate */ block_commit_write(page, 0, len); + clear_page_dirty_for_io(page); /* * Delalloc doesn't support data journalling, * but eventually maybe we'll lift this @@ -2165,8 +2167,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, return ret; } -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, - sector_t logical, long blk_cnt) +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) { int nr_pages, i; pgoff_t index, end; @@ -2174,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (logical + blk_cnt - 1) >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); + index = mpd->first_page; + end = mpd->next_page - 1; while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -2279,9 +2279,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) err = blks; /* * If get block returns EAGAIN or ENOSPC and there - * appears to be free blocks we will call - * ext4_writepage() for all of the pages which will - * just redirty the pages. + * appears to be free blocks we will just let + * mpage_da_submit_io() unlock all of the pages. */ if (err == -EAGAIN) goto submit_io; @@ -2312,8 +2311,10 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) ext4_print_free_blocks(mpd->inode); } /* invalidate all the pages */ - ext4_da_block_invalidatepages(mpd, next, - mpd->b_size >> mpd->inode->i_blkbits); + ext4_da_block_invalidatepages(mpd); + + /* Mark this page range as having been completed */ + mpd->io_done = 1; return; } BUG_ON(blks == 0); @@ -2438,102 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) } /* - * __mpage_da_writepage - finds extent of pages and blocks - * - * @page: page to consider - * @wbc: not used, we just follow rules - * @data: context - * - * The function finds extents of pages and scan them for all blocks. - */ -static int __mpage_da_writepage(struct page *page, - struct writeback_control *wbc, - struct mpage_da_data *mpd) -{ - struct inode *inode = mpd->inode; - struct buffer_head *bh, *head; - sector_t logical; - - /* - * Can we merge this page to current extent? - */ - if (mpd->next_page != page->index) { - /* - * Nope, we can't. 
So, we map non-allocated blocks - * and start IO on them - */ - if (mpd->next_page != mpd->first_page) { - mpage_da_map_and_submit(mpd); - /* - * skip rest of the page in the page_vec - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return MPAGE_DA_EXTENT_TAIL; - } - - /* - * Start next extent of pages ... - */ - mpd->first_page = page->index; - - /* - * ... and blocks - */ - mpd->b_size = 0; - mpd->b_state = 0; - mpd->b_blocknr = 0; - } - - mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - if (mpd->io_done) - return MPAGE_DA_EXTENT_TAIL; - } else { - /* - * Page with regular buffer heads, just add all dirty ones - */ - head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - /* - * We need to try to allocate - * unmapped blocks in the same page. - * Otherwise we won't make progress - * with the page in ext4_writepage - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); - if (mpd->io_done) - return MPAGE_DA_EXTENT_TAIL; - } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { - /* - * mapped dirty buffer. We need to update - * the b_state because we look at - * b_state in mpage_da_map_blocks. We don't - * update b_size because if we find an - * unmapped buffer_head later we need to - * use the b_state flag of that buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - } - - return 0; -} - -/* * This is a special get_blocks_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. @@ -2597,7 +2502,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * for partial write. */ set_buffer_new(bh); - set_buffer_mapped(bh); } return 0; } @@ -2811,27 +2715,27 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) /* * write_cache_pages_da - walk the list of dirty pages of the given - * address space and call the callback function (which usually writes - * the pages). - * - * This is a forked version of write_cache_pages(). Differences: - * Range cyclic is ignored. - * no_nrwrite_index_update is always presumed true + * address space and accumulate pages that need writing, and call + * mpage_da_map_and_submit to map a single contiguous memory region + * and then write them. 
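+ *
+ * A rough sketch of the loop below (the helper next_tagged_page() is
+ * invented here purely for illustration; the real code batches the
+ * lookups with pagevec_lookup_tag()):
+ *
+ *	while ((page = next_tagged_page(mapping))) {
+ *		if (mpd->next_page != page->index &&
+ *		    mpd->next_page != mpd->first_page) {
+ *			mpage_da_map_and_submit(mpd);
+ *			return MPAGE_DA_EXTENT_TAIL;
+ *		}
+ *		mpd->next_page = page->index + 1;
+ *		mpage_add_bh_to_extent(mpd, ...);
+ *	}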
*/ static int write_cache_pages_da(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd, pgoff_t *done_index) { - int ret = 0; - int done = 0; - struct pagevec pvec; - unsigned nr_pages; - pgoff_t index; - pgoff_t end; /* Inclusive */ - long nr_to_write = wbc->nr_to_write; - int tag; - + struct buffer_head *bh, *head; + struct inode *inode = mapping->host; + struct pagevec pvec; + unsigned int nr_pages; + sector_t logical; + pgoff_t index, end; + long nr_to_write = wbc->nr_to_write; + int i, tag, ret = 0; + + memset(mpd, 0, sizeof(struct mpage_da_data)); + mpd->wbc = wbc; + mpd->inode = inode; pagevec_init(&pvec, 0); index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; @@ -2842,13 +2746,11 @@ static int write_cache_pages_da(struct address_space *mapping, tag = PAGECACHE_TAG_DIRTY; *done_index = index; - while (!done && (index <= end)) { - int i; - + while (index <= end) { nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) - break; + return 0; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2860,60 +2762,100 @@ static int write_cache_pages_da(struct address_space *mapping, * mapping. However, page->index will not change * because we have a reference on the page. */ - if (page->index > end) { - done = 1; - break; - } + if (page->index > end) + goto out; *done_index = page->index + 1; + /* + * If we can't merge this page, and we have + * accumulated a contiguous region, write it + */ + if ((mpd->next_page != page->index) && + (mpd->next_page != mpd->first_page)) { + mpage_da_map_and_submit(mpd); + goto ret_extent_tail; + } + lock_page(page); /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data interity operation - * even if there is now a new, dirty page at the same - * pagecache address.
+ * If the page is no longer dirty, or its + * mapping no longer corresponds to inode we + * are writing (which means it has been + * truncated or invalidated), or the page is + * already under writeback and we are not + * doing a data integrity writeback, skip the page */ - if (unlikely(page->mapping != mapping)) { -continue_unlock: + if (!PageDirty(page) || + (PageWriteback(page) && + (wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(page->mapping != mapping)) { unlock_page(page); continue; } - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } - - if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; - } + if (PageWriteback(page)) + wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; - ret = __mpage_da_writepage(page, wbc, mpd); - if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done = 1; - break; - } + if (mpd->next_page != page->index) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + mpage_add_bh_to_extent(mpd, logical, + PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + if (mpd->io_done) + goto ret_extent_tail; + } else { + /* + * Page with regular buffer heads, + * just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_writepage + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); + if (mpd->io_done) + goto ret_extent_tail; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need + * to update the b_state + * because we look at b_state + * in mpage_da_map_blocks. We + * don't update b_size because + * if we find an unmapped + * buffer_head later we need to + * use the b_state flag of that + * buffer_head. + */ + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); } if (nr_to_write > 0) { nr_to_write--; if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) { + wbc->sync_mode == WB_SYNC_NONE) /* * We stop writing back only if we are * not doing integrity sync. In case of @@ -2924,14 +2866,18 @@ continue_unlock: * pages, but have not synced all of the * old dirty pages. 
*/ - done = 1; - break; - } + goto out; } } pagevec_release(&pvec); cond_resched(); } + return 0; +ret_extent_tail: + ret = MPAGE_DA_EXTENT_TAIL; +out: + pagevec_release(&pvec); + cond_resched(); return ret; } @@ -2945,7 +2891,6 @@ static int ext4_da_writepages(struct address_space *mapping, struct mpage_da_data mpd; struct inode *inode = mapping->host; int pages_written = 0; - long pages_skipped; unsigned int max_pages; int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0; @@ -3028,11 +2973,6 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = desired_nr_to_write; } - mpd.wbc = wbc; - mpd.inode = mapping->host; - - pages_skipped = wbc->pages_skipped; - retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); @@ -3059,22 +2999,10 @@ retry: } /* - * Now call __mpage_da_writepage to find the next + * Now call write_cache_pages_da() to find the next * contiguous region of logical blocks that need - * blocks to be allocated by ext4. We don't actually - * submit the blocks for I/O here, even though - * write_cache_pages thinks it will, and will set the - * pages as clean for write before calling - * __mpage_da_writepage(). + * blocks to be allocated by ext4 and submit them. */ - mpd.b_size = 0; - mpd.b_state = 0; - mpd.b_blocknr = 0; - mpd.first_page = 0; - mpd.next_page = 0; - mpd.io_done = 0; - mpd.pages_written = 0; - mpd.retval = 0; ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we @@ -3096,7 +3024,6 @@ retry: * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); - wbc->pages_skipped = pages_skipped; ret = 0; } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* @@ -3104,7 +3031,6 @@ retry: * rest of the pages */ pages_written += mpd.pages_written; - wbc->pages_skipped = pages_skipped; ret = 0; io_done = 1; } else if (wbc->nr_to_write) @@ -3122,11 +3048,6 @@ retry: wbc->range_end = mapping->writeback_index - 1; goto retry; } - if (pages_skipped != wbc->pages_skipped) - ext4_msg(inode->i_sb, KERN_CRIT, - "This should not happen leaving %s " - "with nr_to_write = %ld ret = %d", - __func__, wbc->nr_to_write, ret); /* Update index */ wbc->range_cyclic = range_cyclic; @@ -3460,6 +3381,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { + trace_ext4_readpage(page); return mpage_readpage(page, ext4_get_block); } @@ -3494,6 +3416,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); + trace_ext4_invalidatepage(page, offset); + /* * free any io_end structure allocated for buffers to be discarded */ @@ -3515,6 +3439,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); + trace_ext4_releasepage(page); + WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -3873,11 +3799,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + ssize_t ret; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); - - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + else + ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + 
trace_ext4_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); + return ret; } /* @@ -4173,6 +4104,9 @@ no_top: * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, @@ -4199,33 +4133,32 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) { - ext4_std_error(inode->i_sb, err); - return 1; - } + if (unlikely(err)) + goto out_err; } err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) { - ext4_std_error(inode->i_sb, err); - return 1; - } + if (unlikely(err)) + goto out_err; err = ext4_truncate_restart_trans(handle, inode, blocks_for_truncate(inode)); - if (unlikely(err)) { - ext4_std_error(inode->i_sb, err); - return 1; - } + if (unlikely(err)) + goto out_err; if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; } } for (p = first; p < last; p++) *p = 0; - ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; } /** @@ -4259,7 +4192,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ - int err; + int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); @@ -4281,9 +4214,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - if (ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p)) + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) break; block_to_free = nr; block_to_free_p = p; @@ -4292,9 +4226,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } } - if (count > 0) - ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); @@ -4412,7 +4349,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * transaction where the data blocks are * actually freed. 
*/ - ext4_free_blocks(handle, inode, 0, nr, 1, + ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); @@ -4496,6 +4433,8 @@ void ext4_truncate(struct inode *inode) ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; + trace_ext4_truncate_enter(inode); + if (!ext4_can_truncate(inode)) return; @@ -4506,6 +4445,7 @@ void ext4_truncate(struct inode *inode) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_ext_truncate(inode); + trace_ext4_truncate_exit(inode); return; } @@ -4635,6 +4575,7 @@ out_stop: ext4_orphan_del(handle, inode); ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); } /* @@ -4766,6 +4707,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); @@ -4871,7 +4813,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); - iloc.bh = 0; + iloc.bh = NULL; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a84faa110bc..808c554e773 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -334,16 +334,22 @@ mext_out: case FITRIM: { struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, (struct fstrim_range *)arg, sizeof(range))) return -EFAULT; + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; @@ -421,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case EXT4_IOC_MOVE_EXT: + case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d1fe09aea73..a5837a837a8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -432,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) } /* at order 0 we see each particular block */ - *max = 1 << (e4b->bd_blkbits + 3); - if (order == 0) + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); return EXT4_MB_BITMAP(e4b); + } bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; @@ -616,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); - buddy = mb_find_buddy(e4b, 0, &max); list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -635,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, #define mb_check_buddy(e4b) #endif -/* FIXME!! need more doc */ +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) @@ -2381,7 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb) /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 
* So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); + sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); return -ENOMEM; @@ -3208,7 +3213,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); - if (cur_distance < new_distance) + if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ @@ -3907,7 +3912,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (!mb_enable_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; printk(KERN_ERR "EXT4-fs: Can't allocate:" @@ -4753,7 +4759,8 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. This is done until whole group is scanned. */ -ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { void *bitmap; @@ -4863,10 +4870,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) break; } - if (len >= EXT4_BLOCKS_PER_GROUP(sb)) - len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); - else + /* + * For all the groups except the last one, last block will + * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case start + + * len < EXT4_BLOCKS_PER_GROUP(sb). 
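+ *
+ * A worked example with made-up numbers: if
+ * EXT4_BLOCKS_PER_GROUP(sb) == 32768, first_block == 100 and
+ * len == 40000, the first group is trimmed from block 100 up to the
+ * group end (len drops by 32668 to 7332), and the next group, now
+ * the last one, is trimmed from block 0 to 7331.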
+ */ + if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) last_block = first_block + len; + len -= last_block - first_block; if (e4b.bd_info->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, &e4b, first_block, diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b619322c76f..22bd4d7f289 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -169,7 +169,7 @@ struct ext4_allocation_context { /* original request */ struct ext4_free_extent ac_o_ex; - /* goal request (after normalization) */ + /* goal request (normalized ac_o_ex) */ struct ext4_free_extent ac_g_ex; /* the best found extent */ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b0a126f23c2..d1bafa57f48 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle, for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; @@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; @@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, block, 1, + ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return retval; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e781b7ea563..67fd0b02585 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -40,6 +40,7 @@ #include "xattr.h" #include "acl.h" +#include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. 
*/ @@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle; + trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) end_unlink: ext4_journal_stop(handle); brelse(bh); + trace_ext4_unlink_exit(dentry, retval); return retval; } @@ -2402,6 +2405,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_inode && new_dir != old_dir && EXT4_DIR_LINK_MAX(new_dir)) goto end_rename; + BUFFER_TRACE(dir_bh, "get_write_access"); + retval = ext4_journal_get_write_access(handle, dir_bh); + if (retval) + goto end_rename; } if (!new_bh) { retval = ext4_add_entry(handle, new_dentry, old_inode); @@ -2409,7 +2416,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } else { BUFFER_TRACE(new_bh, "get write access"); - ext4_journal_get_write_access(handle, new_bh); + retval = ext4_journal_get_write_access(handle, new_bh); + if (retval) + goto end_rename; new_de->inode = cpu_to_le32(old_inode->i_ino); if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) @@ -2470,8 +2479,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - ext4_journal_get_write_access(handle, dir_bh); PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index e2cd90e4bb7..b6dbd056fcb 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -259,6 +259,11 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); + return; + } + /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); @@ -279,9 +284,9 @@ void ext4_io_submit(struct ext4_io_submit *io) BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); bio_put(io->io_bio); } - io->io_bio = 0; + io->io_bio = NULL; io->io_op = 0; - io->io_end = 0; + io->io_end = NULL; } static int io_submit_init(struct ext4_io_submit *io, @@ -380,8 +385,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - set_page_writeback(page); - ClearPageError(page); io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); if (!io_page) { @@ -392,6 +395,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, io_page->p_page = page; atomic_set(&io_page->p_count, 1); get_page(page); + set_page_writeback(page); + ClearPageError(page); for (bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3ecc6e45d2f..80bbc9c60c2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -230,7 +230,7 @@ static int setup_new_group_blocks(struct super_block *sb, } /* Zero out all of the reserved backup group descriptor table blocks */ - ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", block, sbi->s_itb_per_group); err = sb_issue_zeroout(sb, gdblocks + 
start + 1, reserved_gdb, GFP_NOFS); @@ -248,7 +248,7 @@ static int setup_new_group_blocks(struct super_block *sb, /* Zero out all of the inode table blocks */ block = input->inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", block, sbi->s_itb_per_group); err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) @@ -499,12 +499,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: - /* ext4_journal_release_buffer(handle, iloc.bh); */ + /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: - /* ext4_journal_release_buffer(handle, dind); */ + /* ext4_handle_release_buffer(handle, dind); */ exit_sbh: - /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ + /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* int j; for (j = 0; j < i; j++) - ext4_journal_release_buffer(handle, primary[j]); + ext4_handle_release_buffer(handle, primary[j]); */ goto exit_bh; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 203f9e4a70b..22546ad7f0a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -54,9 +54,9 @@ static struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; -struct ext4_lazy_init *ext4_li_info; -struct mutex ext4_li_mtx; -struct ext4_features *ext4_feat; +static struct ext4_lazy_init *ext4_li_info; +static struct mutex ext4_li_mtx; +static struct ext4_features *ext4_feat; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -75,6 +75,7 @@ static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); @@ -594,7 +595,7 @@ __acquires(bitlock) vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); @@ -997,13 +998,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, OLDALLOC)) seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT4_FS_XATTR - if (test_opt(sb, XATTR_USER) && - !(def_mount_opts & EXT4_DEFM_XATTR_USER)) + if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER) && - (def_mount_opts & EXT4_DEFM_XATTR_USER)) { + if (!test_opt(sb, XATTR_USER)) seq_puts(seq, ",nouser_xattr"); - } #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) @@ -1041,8 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); - if (test_opt(sb, MBLK_IO_SUBMIT)) - seq_puts(seq, ",mblk_io_submit"); + if (!test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",nomblk_io_submit"); if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -1451,7 +1449,7 @@ static int parse_options(char *options, struct super_block *sb, * Initialize args struct so we know 
whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -1771,7 +1769,7 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; - if (!is_power_of_2(option)) { + if (option && !is_power_of_2(option)) { ext4_msg(sb, KERN_ERR, "EXT4-fs: inode_readahead_blks" " must be a power of 2"); @@ -2120,6 +2118,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, return; } + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { if (es->s_last_orphan) jbd_debug(1, "Errors on filesystem, " @@ -2412,7 +2417,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, if (parse_strtoul(buf, 0x40000000, &t)) return -EINVAL; - if (!is_power_of_2(t)) + if (t && !is_power_of_2(t)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -3095,14 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ #ifdef CONFIG_EXT4_FS_XATTR - if (def_mount_opts & EXT4_DEFM_XATTR_USER) - set_opt(sb, XATTR_USER); + set_opt(sb, XATTR_USER); #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL - if (def_mount_opts & EXT4_DEFM_ACL) - set_opt(sb, POSIX_ACL); + set_opt(sb, POSIX_ACL); #endif + set_opt(sb, MBLK_IO_SUBMIT); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) @@ -3516,7 +3521,7 @@ no_journal: * concurrency isn't really necessary. Limit it to 1. 
*/ EXT4_SB(sb)->dio_unwritten_wq = - alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1); + alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); goto failed_mount_wq; @@ -3531,17 +3536,16 @@ no_journal: if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); + root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { - iput(root); ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); - iput(root); ret = -ENOMEM; goto failed_mount4; } @@ -3657,6 +3661,8 @@ cantfind_ext4: goto failed_mount; failed_mount4: + iput(root); + sb->s_root = NULL; ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fc32176eee3..b545ca1c459 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - jbd2_journal_release_buffer(handle, bs->bh); + ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; @@ -833,7 +833,7 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext4_free_blocks(handle, inode, 0, block, 1, + ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); error = -EIO; goto cleanup; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 59c6e495678..b5ed541fb13 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) } /* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + +/* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. 
* @@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { /* - * Prevent speculative execution through spin_unlock(&inode_lock); + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); */ + smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + assert_spin_locked(&inode_wb_list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { - spin_unlock(&inode_lock); + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_lock. Either the - * caller has ref on the inode (either via __iget or via syscall against an fd) - * or the inode has I_WILL_FREE set (via generic_forget_inode) + * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * inode->i_lock. Either the caller has an active reference on the inode or + * the inode has I_WILL_FREE set. * * If `wait' is set, wait on the writeout. * * The whole writeout design is quite complex and fragile. We want to avoid * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. - * - * Called under inode_lock. 
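+ *
+ * The expected calling pattern, as write_inode_now() and sync_inode()
+ * use it below:
+ *
+ *	spin_lock(&inode_wb_list_lock);
+ *	spin_lock(&inode->i_lock);
+ *	ret = writeback_single_inode(inode, &wbc);
+ *	spin_unlock(&inode->i_lock);
+ *	spin_unlock(&inode_wb_list_lock);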
*/ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; + assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&inode->i_lock); + if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else @@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); ret = do_writepages(mapping, wbc); @@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * due to delalloc, clear dirty metadata flags right before * write_inode() */ - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { @@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * kind does not need peridic writeout yet, and for the latter * kind writeout is handled by the freer. */ + spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); requeue_io(inode); continue; } @@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. 
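 *
 * (inode_dirtied_after() is essentially time_after(inode->dirtied_when,
 * wbc->wb_start); on 32-bit jiffies it also treats a dirtied_when value
 * apparently in the future as not-after, to cope with wraparound. The
 * helper itself is not shown in this hunk.)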
*/ - if (inode_dirtied_after(inode, wbc->wb_start)) + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); return 1; + } __iget(inode); + pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); if (wbc->pages_skipped != pages_skipped) { @@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, */ redirty_tail(inode); } - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); iput(inode); cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ } @@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } /* @@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); + spin_lock(&inode->i_lock); inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } return wrote; @@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; - bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * superblock list, based upon its state. */ if (inode->i_state & I_SYNC) - goto out; + goto out_unlock_inode; /* * Only add valid (hashed) inodes to the superblock's @@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out; + goto out_unlock_inode; } if (inode->i_state & I_FREEING) - goto out; + goto out_unlock_inode; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). 
*/ if (!was_dirty) { + bool wakeup_bdi = false; bdi = inode_to_bdi(inode); if (bdi_cap_writeback_dirty(bdi)) { @@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; } } -out: - spin_unlock(&inode_lock); +out_unlock_inode: + spin_unlock(&inode->i_lock); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb) * we still have to wait for that writeout. */ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + struct address_space *mapping = inode->i_mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. */ iput(old_inode); old_inode = inode; @@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); } @@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, &wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/inode.c b/fs/inode.c index 0b3da4a7770..5f4e11aaeb5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -26,6 +26,38 @@ #include <linux/posix_acl.h> #include <linux/ima.h> #include <linux/cred.h> +#include "internal.h" + +/* + * inode locking rules. 
+ * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * inode_lru_lock protects: + * inode_lru, inode->i_lru + * inode_sb_list_lock protects: + * sb->s_inodes, inode->i_sb_list + * inode_wb_list_lock protects: + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash + * + * Lock ordering: + * + * inode_sb_list_lock + * inode->i_lock + * inode_lru_lock + * + * inode_wb_list_lock + * inode->i_lock + * + * inode_hash_lock + * inode_sb_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock + */ /* * This is needed for the following functions: @@ -60,6 +92,8 @@ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Each inode can be on two separate lists. One is @@ -74,15 +108,10 @@ static unsigned int i_hash_shift __read_mostly; */ static LIST_HEAD(inode_lru); -static struct hlist_head *inode_hashtable __read_mostly; +static DEFINE_SPINLOCK(inode_lru_lock); -/* - * A simple spinlock to protect the list manipulations. - * - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. - */ -DEFINE_SPINLOCK(inode_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the @@ -137,15 +166,6 @@ int proc_nr_inodes(ctl_table *table, int write, } #endif -static void wake_up_inode(struct inode *inode) -{ - /* - * Prevent speculative execution through spin_unlock(&inode_lock); - */ - smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); -} - /** * inode_init_always - perform inode structure intialisation * @sb: superblock inode belongs to @@ -336,7 +356,7 @@ static void init_once(void *foo) } /* - * inode_lock must be held + * inode->i_lock must be held */ void __iget(struct inode *inode) { @@ -354,23 +374,22 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { + spin_lock(&inode_lru_lock); if (list_empty(&inode->i_lru)) { list_add(&inode->i_lru, &inode_lru); inodes_stat.nr_unused++; } + spin_unlock(&inode_lru_lock); } static void inode_lru_list_del(struct inode *inode) { + spin_lock(&inode_lru_lock); if (!list_empty(&inode->i_lru)) { list_del_init(&inode->i_lru); inodes_stat.nr_unused--; } -} - -static inline void __inode_sb_list_add(struct inode *inode) -{ - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_lru_lock); } /** @@ -379,15 +398,17 @@ static inline void __inode_sb_list_add(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_lock); - __inode_sb_list_add(inode); - spin_unlock(&inode_lock); + spin_lock(&inode_sb_list_lock); + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_sb_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); -static inline void __inode_sb_list_del(struct inode *inode) +static inline void inode_sb_list_del(struct inode *inode) { + spin_lock(&inode_sb_list_lock); list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -412,24 +433,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); 
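+	/*
+	 * Lock ordering, per the rules now documented at the top of
+	 * inode.c: inode_hash_lock nests outside inode->i_lock, so the
+	 * hash lock is taken first here.
+	 */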
+ spin_lock(&inode->i_lock); hlist_add_head(&inode->i_hash, b); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** - * __remove_inode_hash - remove an inode from the hash - * @inode: inode to unhash - * - * Remove an inode from the superblock. - */ -static void __remove_inode_hash(struct inode *inode) -{ - hlist_del_init(&inode->i_hash); -} - -/** * remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * @@ -437,9 +449,11 @@ static void __remove_inode_hash(struct inode *inode) */ void remove_inode_hash(struct inode *inode) { - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(remove_inode_hash); @@ -456,10 +470,29 @@ void end_writeback(struct inode *inode) } EXPORT_SYMBOL(end_writeback); +/* + * Free the inode passed in, removing it from the lists it is still connected + * to. We remove any pages still attached to the inode and wait for any IO that + * is still in progress before finally destroying the inode. + * + * An inode must already be marked I_FREEING so that we avoid the inode being + * moved back onto lists if we race with other code that manipulates the lists + * (e.g. writeback_single_inode). The caller is responsible for setting this. + * + * An inode must already be removed from the LRU list before being evicted from + * the cache. This should occur atomically with setting the I_FREEING state + * flag, so no inodes here should ever be on the LRU when being evicted. + */ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!list_empty(&inode->i_lru)); + + inode_wb_list_del(inode); + inode_sb_list_del(inode); + if (op->evict_inode) { op->evict_inode(inode); } else { @@ -471,6 +504,15 @@ static void evict(struct inode *inode) bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); + + remove_inode_hash(inode); + + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); + + destroy_inode(inode); } /* @@ -489,14 +531,6 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); - - spin_lock(&inode_lock); - __remove_inode_hash(inode); - __inode_sb_list_del(inode); - spin_unlock(&inode_lock); - - wake_up_inode(inode); - destroy_inode(inode); } } @@ -514,25 +548,23 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } inode->i_state |= I_FREEING; - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. 
- */ - list_move(&inode->i_lru, &dispose); - list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); dispose_list(&dispose); @@ -561,31 +593,30 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & I_DIRTY && !kill_dirty) { + spin_unlock(&inode->i_lock); busy = 1; continue; } if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); busy = 1; continue; } inode->i_state |= I_FREEING; - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ - list_move(&inode->i_lru, &dispose); - list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); dispose_list(&dispose); @@ -607,7 +638,7 @@ static int can_unuse(struct inode *inode) /* * Scan `goal' inodes on the unused list for freeable ones. They are moved to a - * temporary list and then are freed outside inode_lock by dispose_list(). + * temporary list and then are freed outside inode_lru_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -628,7 +659,7 @@ static void prune_icache(int nr_to_scan) unsigned long reap = 0; down_read(&iprune_sem); - spin_lock(&inode_lock); + spin_lock(&inode_lru_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; @@ -638,53 +669,67 @@ static void prune_icache(int nr_to_scan) inode = list_entry(inode_lru.prev, struct inode, i_lru); /* + * we are inverting the inode_lru_lock/inode->i_lock here, + * so use a trylock. If we fail to get the lock, just move the + * inode to the back of the list so we don't spin on it. + */ + if (!spin_trylock(&inode->i_lock)) { + list_move(&inode->i_lru, &inode_lru); + continue; + } + + /* * Referenced or dirty inodes are still in use. Give them * another pass through the LRU as we canot reclaim them now. 
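 *
 * (In effect a second-chance scheme: a nonzero i_count or any state
 * bit other than I_REFERENCED removes the inode from the LRU, while
 * I_REFERENCED alone is cleared and buys the inode one more pass
 * before it can be reclaimed.)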
*/ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); inodes_stat.nr_unused--; continue; } /* recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { - list_move(&inode->i_lru, &inode_lru); inode->i_state &= ~I_REFERENCED; + list_move(&inode->i_lru, &inode_lru); + spin_unlock(&inode->i_lock); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_lru_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); - spin_lock(&inode_lock); + spin_lock(&inode_lru_lock); if (inode != list_entry(inode_lru.next, struct inode, i_lru)) continue; /* wrong inode or list_empty */ - if (!can_unuse(inode)) + /* avoid lock inversions with trylock */ + if (!spin_trylock(&inode->i_lock)) + continue; + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); continue; + } } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ list_move(&inode->i_lru, &freeable); - list_del_init(&inode->i_wb_list); inodes_stat.nr_unused--; } if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&inode_lock); + spin_unlock(&inode_lru_lock); dispose_list(&freeable); up_read(&iprune_sem); @@ -733,15 +778,21 @@ static struct inode *find_inode(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_sb != sb) + spin_lock(&inode->i_lock); + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; - if (!test(inode, data)) + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -759,15 +810,21 @@ static struct inode *find_inode_fast(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_ino != ino) + spin_lock(&inode->i_lock); + if (inode->i_ino != ino) { + spin_unlock(&inode->i_lock); continue; - if (inode->i_sb != sb) + } + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -827,19 +884,26 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&inode_lock); + spin_lock_prefetch(&inode_sb_list_lock); inode = alloc_inode(sb); if (inode) { - spin_lock(&inode_lock); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = 0; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); } return inode; } EXPORT_SYMBOL(new_inode); +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ void unlock_new_inode(struct inode *inode) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -859,51 +923,67 @@ void unlock_new_inode(struct inode *inode) } } #endif - /* - * This is special! 
We do not need the spinlock when clearing I_NEW, - * because we're guaranteed that nobody else tries to do anything about - * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_NEW). - * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_NEW after the other inode initialisation has - * completed. - */ - smp_mb(); + spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; - wake_up_inode(inode); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); -/* - * This is called without the inode lock held.. Be careful. +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present return it with an increased reference count. This is + * a generalized version of iget_locked() for file systems where the inode + * number is not sufficient for unique identification of an inode. * - * We no longer cache the sb_flags in i_flags - see fs.h - * -- rmk@arm.uk.linux.org + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. */ -static struct inode *get_new_inode(struct super_block *sb, - struct hlist_head *head, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), - void *data) +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) { + struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + if (inode) { + wait_on_inode(inode); + return inode; + } + inode = alloc_inode(sb); if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode(sb, head, test, data); if (!old) { if (set(inode, data)) goto set_failed; - hlist_add_head(&inode->i_hash, head); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = I_NEW; - spin_unlock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -916,7 +996,7 @@ static struct inode *get_new_inode(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. 
*/ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -924,33 +1004,53 @@ static struct inode *get_new_inode(struct super_block *sb, return inode; set_failed: - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); return NULL; } +EXPORT_SYMBOL(iget5_locked); -/* - * get_new_inode_fast is the fast path version of get_new_inode, see the - * comment at iget_locked for details. +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * Search for the inode specified by @ino in the inode cache and if present + * return it with an increased reference count. This is for file systems + * where the inode number is sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). */ -static struct inode *get_new_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) +struct inode *iget_locked(struct super_block *sb, unsigned long ino) { + struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + if (inode) { + wait_on_inode(inode); + return inode; + } + inode = alloc_inode(sb); if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; - hlist_add_head(&inode->i_hash, head); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = I_NEW; - spin_unlock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -963,13 +1063,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); } return inode; } +EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. 
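The iget5_locked()/iget_locked() rewrite above follows one pattern throughout: an optimistic lookup under inode_hash_lock, an unlocked allocation, then a second lookup before insertion to catch a racing creator. A condensed sketch of that flow, assembled from the hunks above (iget_locked_sketch is a stand-in name; the @set callback and set_failed path of the five-argument variant are elided):

	struct inode *iget_locked_sketch(struct super_block *sb, unsigned long ino)
	{
		struct hlist_head *head = inode_hashtable + hash(sb, ino);
		struct inode *inode, *old;

		spin_lock(&inode_hash_lock);
		inode = find_inode_fast(sb, head, ino);	/* takes each inode's i_lock */
		spin_unlock(&inode_hash_lock);
		if (inode) {
			wait_on_inode(inode);		/* sleep until I_NEW clears */
			return inode;
		}

		inode = alloc_inode(sb);		/* may block: no spinlocks held */
		if (!inode)
			return NULL;

		spin_lock(&inode_hash_lock);
		old = find_inode_fast(sb, head, ino);	/* re-check: the lock was dropped */
		if (!old) {
			inode->i_ino = ino;
			spin_lock(&inode->i_lock);	/* i_lock nests inside inode_hash_lock */
			inode->i_state = I_NEW;
			hlist_add_head(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			inode_sb_list_add(inode);
			spin_unlock(&inode_hash_lock);
			return inode;	/* caller fills it in, then calls unlock_new_inode() */
		}
		spin_unlock(&inode_hash_lock);
		destroy_inode(inode);			/* raced: discard ours, use the winner's */
		wait_on_inode(old);
		return old;
	}

The nesting is the important part: inode->i_lock is always taken inside inode_hash_lock, which is why __wait_on_freeing_inode() later in the series drops the inode lock and then the hash lock before sleeping.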
@@ -984,10 +1085,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino) struct hlist_node *node; struct inode *inode; + spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, node, b, i_hash) { - if (inode->i_ino == ino && inode->i_sb == sb) + if (inode->i_ino == ino && inode->i_sb == sb) { + spin_unlock(&inode_hash_lock); return 0; + } } + spin_unlock(&inode_hash_lock); return 1; } @@ -1017,7 +1122,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) static unsigned int counter; ino_t res; - spin_lock(&inode_lock); spin_lock(&iunique_lock); do { if (counter <= max_reserved) @@ -1025,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); - spin_unlock(&inode_lock); return res; } @@ -1033,116 +1136,50 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { - spin_lock(&inode_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); - else + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode has not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; - spin_unlock(&inode_lock); + } return inode; } EXPORT_SYMBOL(igrab); /** - * ifind - internal function, you want ilookup5() or iget5(). - * @sb: super block of file system to search - * @head: the head of the list to search - * @test: callback used for comparisons between inodes - * @data: opaque data pointer to pass to @test - * @wait: if true wait for the inode to be unlocked, if false do not - * - * ifind() searches for the inode specified by @data in the inode - * cache. This is a generalized version of ifind_fast() for file systems where - * the inode number is not sufficient for unique identification of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. - * - * Note, @test is called with the inode_lock held, so can't sleep. - */ -static struct inode *ifind(struct super_block *sb, - struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, const int wait) -{ - struct inode *inode; - - spin_lock(&inode_lock); - inode = find_inode(sb, head, test, data); - if (inode) { - spin_unlock(&inode_lock); - if (likely(wait)) - wait_on_inode(inode); - return inode; - } - spin_unlock(&inode_lock); - return NULL; -} - -/** - * ifind_fast - internal function, you want ilookup() or iget(). - * @sb: super block of file system to search - * @head: head of the list to search - * @ino: inode number to search for - * - * ifind_fast() searches for the inode @ino in the inode cache. This is for - * file systems where the inode number is sufficient for unique identification - * of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. 
- */ -static struct inode *ifind_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) -{ - struct inode *inode; - - spin_lock(&inode_lock); - inode = find_inode_fast(sb, head, ino); - if (inode) { - spin_unlock(&inode_lock); - wait_on_inode(inode); - return inode; - } - spin_unlock(&inode_lock); - return NULL; -} - -/** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * - * ilookup5() uses ifind() to search for the inode specified by @hashval and - * @data in the inode cache. This is a generalized version of ilookup() for - * file systems where the inode number is not sufficient for unique - * identification of an inode. - * + * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented - * reference count. Note, the inode lock is not waited upon so you have to be - * very careful what you do with the returned inode. You probably should be - * using ilookup5() instead. + * reference count. * - * Otherwise NULL is returned. + * Note: I_NEW is not waited upon so you have to be very careful what you do + * with the returned inode. You probably should be using ilookup5() instead. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); - return ifind(sb, head, test, data, 0); + return inode; } EXPORT_SYMBOL(ilookup5_nowait); @@ -1153,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait); * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * - * ilookup5() uses ifind() to search for the inode specified by @hashval and - * @data in the inode cache. This is a generalized version of ilookup() for - * file systems where the inode number is not sufficient for unique - * identification of an inode. - * - * If the inode is in the cache, the inode lock is waited upon and the inode is + * Search for the inode specified by @hashval and @data in the inode cache, + * and if the inode is in the cache, return the inode with an incremented + * reference count. Waits on I_NEW before returning the inode. * - * Otherwise NULL is returned. + * This is a generalized version of ilookup() for file systems where the + * inode number is not sufficient for unique identification of an inode. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note: @test is called with the inode_hash_lock held, so can't sleep. 
*/ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode = ilookup5_nowait(sb, hashval, test, data); - return ifind(sb, head, test, data, 1); + if (inode) + wait_on_inode(inode); + return inode; } EXPORT_SYMBOL(ilookup5); @@ -1179,91 +1216,23 @@ EXPORT_SYMBOL(ilookup5); * @sb: super block of file system to search * @ino: inode number to search for * - * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. - * This is for file systems where the inode number is sufficient for unique - * identification of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. + * Search for the inode @ino in the inode cache, and if the inode is in the + * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); - - return ifind_fast(sb, head, ino); -} -EXPORT_SYMBOL(ilookup); - -/** - * iget5_locked - obtain an inode from a mounted file system - * @sb: super block of file system - * @hashval: hash value (usually inode number) to get - * @test: callback used for comparisons between inodes - * @set: callback used to initialize a new struct inode - * @data: opaque data pointer to pass to @test and @set - * - * iget5_locked() uses ifind() to search for the inode specified by @hashval - * and @data in the inode cache and if present it is returned with an increased - * reference count. This is a generalized version of iget_locked() for file - * systems where the inode number is not sufficient for unique identification - * of an inode. - * - * If the inode is not in cache, get_new_inode() is called to allocate a new - * inode and this is returned locked, hashed, and with the I_NEW flag set. The - * file system gets to fill it in before unlocking it via unlock_new_inode(). - * - * Note both @test and @set are called with the inode_lock held, so can't sleep. - */ -struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *data) -{ - struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; - inode = ifind(sb, head, test, data, 1); - if (inode) - return inode; - /* - * get_new_inode() will do the right thing, re-trying the search - * in case it had to block at any point. - */ - return get_new_inode(sb, head, test, set, data); -} -EXPORT_SYMBOL(iget5_locked); - -/** - * iget_locked - obtain an inode from a mounted file system - * @sb: super block of file system - * @ino: inode number to get - * - * iget_locked() uses ifind_fast() to search for the inode specified by @ino in - * the inode cache and if present it is returned with an increased reference - * count. This is for file systems where the inode number is sufficient for - * unique identification of an inode. - * - * If the inode is not in cache, get_new_inode_fast() is called to allocate a - * new inode and this is returned locked, hashed, and with the I_NEW flag set. - * The file system gets to fill it in before unlocking it via - * unlock_new_inode(). 
- */ -struct inode *iget_locked(struct super_block *sb, unsigned long ino) -{ - struct hlist_head *head = inode_hashtable + hash(sb, ino); - struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); - inode = ifind_fast(sb, head, ino); if (inode) - return inode; - /* - * get_new_inode_fast() will do the right thing, re-trying the search - * in case it had to block at any point. - */ - return get_new_inode_fast(sb, head, ino); + wait_on_inode(inode); + return inode; } -EXPORT_SYMBOL(iget_locked); +EXPORT_SYMBOL(ilookup); int insert_inode_locked(struct inode *inode) { @@ -1271,27 +1240,33 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1308,29 +1283,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_NEW; - while (1) { struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_sb != sb) continue; if (!test(old, data)) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1375,47 +1355,35 @@ static void iput_final(struct inode *inode) const struct super_operations *op = inode->i_sb->s_op; int drop; + WARN_ON(inode->i_state & I_NEW); + if (op && op->drop_inode) drop = op->drop_inode(inode); else drop = generic_drop_inode(inode); + if (!drop && (sb->s_flags & MS_ACTIVE)) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) + inode_lru_list_add(inode); + spin_unlock(&inode->i_lock); + return; + } + if (!drop) { - if (sb->s_flags & MS_ACTIVE) { - inode->i_state |= I_REFERENCED; - if (!(inode->i_state & (I_DIRTY|I_SYNC))) { - inode_lru_list_add(inode); - } - spin_unlock(&inode_lock); - return; - } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); write_inode_now(inode, 1); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); 
inode->i_state &= ~I_WILL_FREE; - __remove_inode_hash(inode); } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ inode_lru_list_del(inode); - list_del_init(&inode->i_wb_list); + spin_unlock(&inode->i_lock); - __inode_sb_list_del(inode); - spin_unlock(&inode_lock); evict(inode); - remove_inode_hash(inode); - wake_up_inode(inode); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); - destroy_inode(inode); } /** @@ -1432,7 +1400,7 @@ void iput(struct inode *inode) { if (inode) { BUG_ON(inode->i_state & I_CLEAR); - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) iput_final(inode); } } @@ -1611,9 +1579,8 @@ EXPORT_SYMBOL(inode_wait); * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to - * wake_up_inode() after removing from the hash list will DTRT. - * - * This is called with inode_lock held. + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode) { @@ -1621,10 +1588,11 @@ static void __wait_on_freeing_inode(struct inode *inode) DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq, &wait.wait); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); } static __initdata unsigned long ihash_entries; diff --git a/fs/internal.h b/fs/internal.h index 8318059b42c..b29c46e4e32 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,6 +125,13 @@ extern long do_handle_open(int mountdirfd, /* * inode.c */ +extern spinlock_t inode_sb_list_lock; + +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4f9cc048294..3e93cdd1900 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -31,7 +31,7 @@ * is used to release xattr name/value pair and detach from c->xattrindex. * reclaim_xattr_datum(c) * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when - * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold + * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold * is hard coded as 32KiB. * do_verify_xattr_datum(c, xd) * is used to load the xdatum information without name/value pair from the medium. 
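With inode_lock gone, the final-iput path above is driven entirely by inode->i_lock: atomic_dec_and_lock() only takes the lock when i_count actually reaches zero, and iput_final() runs with it held until the inode is either parked on the LRU or freed. A reduced sketch of that flow (sketch names are stand-ins; the s_op->drop_inode dispatch is collapsed to generic_drop_inode for brevity):

	/* called with inode->i_lock held; always drops it */
	static void iput_final_sketch(struct inode *inode)
	{
		struct super_block *sb = inode->i_sb;
		int drop = generic_drop_inode(inode);

		if (!drop && (sb->s_flags & MS_ACTIVE)) {
			/* keep the inode cached: park it on the LRU for prune_icache() */
			inode->i_state |= I_REFERENCED;
			if (!(inode->i_state & (I_DIRTY | I_SYNC)))
				inode_lru_list_add(inode);
			spin_unlock(&inode->i_lock);
			return;
		}
		if (!drop) {
			/* superblock is going away: write the inode back first */
			inode->i_state |= I_WILL_FREE;
			spin_unlock(&inode->i_lock);
			write_inode_now(inode, 1);
			spin_lock(&inode->i_lock);
			inode->i_state &= ~I_WILL_FREE;
		}
		inode->i_state |= I_FREEING;	/* nothing may requeue it after this */
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		evict(inode);
	}

	void iput_sketch(struct inode *inode)
	{
		/* the lock is only taken on the 1 -> 0 transition */
		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
			iput_final_sketch(inode);
	}

Because I_FREEING is set under i_lock before the unlock, lockless observers such as the LRU pruner see either a live inode or one already marked for destruction, never a half-torn-down one.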
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 03b8c240aed..edfea7a3a74 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) return ret; } -/* called with inode_lock held */ +/* called with inode->i_lock held */ static int logfs_drop_inode(struct inode *inode) { struct logfs_super *super = logfs_super(inode->i_sb); diff --git a/fs/namei.c b/fs/namei.c index d0066e17d45..3cb616d38d9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -992,6 +992,12 @@ int follow_down_one(struct path *path) return 0; } +static inline bool managed_dentry_might_block(struct dentry *dentry) +{ + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && + dentry->d_op->d_manage(dentry, true) < 0); +} + /* * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we * meet a managed dentry and we're not walking to "..". True is returned to @@ -1000,19 +1006,26 @@ int follow_down_one(struct path *path) static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, bool reverse_transit) { - while (d_mountpoint(path->dentry)) { + for (;;) { struct vfsmount *mounted; - if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && - !reverse_transit && - path->dentry->d_op->d_manage(path->dentry, true) < 0) + /* + * Don't forget we might have a non-mountpoint managed dentry + * that wants to block transit. + */ + *inode = path->dentry->d_inode; + if (!reverse_transit && + unlikely(managed_dentry_might_block(path->dentry))) return false; + + if (!d_mountpoint(path->dentry)) + break; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; path->mnt = mounted; path->dentry = mounted->mnt_root; nd->seq = read_seqcount_begin(&path->dentry->d_seq); - *inode = path->dentry->d_inode; } if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index abdf38d5971..7237672216c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -44,6 +44,7 @@ /* #define NFS_DEBUG_VERBOSE 1 */ static int nfs_opendir(struct inode *, struct file *); +static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); @@ -64,7 +65,7 @@ const struct file_operations nfs_dir_operations = { .read = generic_read_dir, .readdir = nfs_readdir, .open = nfs_opendir, - .release = nfs_release, + .release = nfs_closedir, .fsync = nfs_fsync_dir, }; @@ -133,13 +134,35 @@ const struct inode_operations nfs4_dir_inode_operations = { #endif /* CONFIG_NFS_V4 */ +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) +{ + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->duped = 0; + ctx->dir_cookie = 0; + ctx->dup_cookie = 0; + ctx->cred = get_rpccred(cred); + } else + ctx = ERR_PTR(-ENOMEM); + return ctx; +} + +static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +{ + put_rpccred(ctx->cred); + kfree(ctx); +} + /* * Open file */ static int nfs_opendir(struct inode *inode, struct file *filp) { - int res; + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; dfprintk(FILE, "NFS: open dir(%s/%s)\n", filp->f_path.dentry->d_parent->d_name.name, @@ -147,8 +170,15 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, 
NFSIOS_VFSOPEN); - /* Call generic open code in order to cache credentials */ - res = nfs_open(inode, filp); + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { /* This is a mountpoint, so d_revalidate will never * have been called, so we need to refresh the @@ -156,9 +186,18 @@ nfs_opendir(struct inode *inode, struct file *filp) */ __nfs_revalidate_inode(NFS_SERVER(inode), inode); } +out: + put_rpccred(cred); return res; } +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(filp->private_data); + return 0; +} + struct nfs_cache_array_entry { u64 cookie; u64 ino; @@ -284,19 +323,20 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri { loff_t diff = desc->file->f_pos - desc->current_index; unsigned int index; + struct nfs_open_dir_context *ctx = desc->file->private_data; if (diff < 0) goto out_eof; if (diff >= array->size) { if (array->eof_index >= 0) goto out_eof; - desc->current_index += array->size; return -EAGAIN; } index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; + ctx->duped = 0; return 0; out_eof: desc->eof = 1; @@ -307,10 +347,18 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { int i; + loff_t new_pos; int status = -EAGAIN; + struct nfs_open_dir_context *ctx = desc->file->private_data; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { + new_pos = desc->current_index + i; + if (new_pos < desc->file->f_pos) { + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = 1; + } + desc->file->f_pos = new_pos; desc->cache_entry_index = i; return 0; } @@ -342,6 +390,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) if (status == -EAGAIN) { desc->last_cookie = array->last_cookie; + desc->current_index += array->size; desc->page_index++; } nfs_readdir_release_array(desc->page); @@ -354,7 +403,8 @@ static int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct file *file, struct inode *inode) { - struct rpc_cred *cred = nfs_file_cred(file); + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; unsigned long timestamp, gencount; int error; @@ -693,6 +743,20 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int i = 0; int res = 0; struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; + + if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop. " + "Please contact your server vendor. 
" + "Offending cookie: %llu\n", + file->f_dentry->d_parent->d_name.name, + file->f_dentry->d_name.name, + *desc->dir_cookie); + } + res = -ELOOP; + goto out; + } array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { @@ -785,6 +849,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; + struct nfs_open_dir_context *dir_ctx = filp->private_data; int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", @@ -801,7 +866,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) memset(desc, 0, sizeof(*desc)); desc->file = filp; - desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; + desc->dir_cookie = &dir_ctx->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = NFS_USE_READDIRPLUS(inode); @@ -853,6 +918,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", dentry->d_parent->d_name.name, @@ -872,7 +938,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) } if (offset != filp->f_pos) { filp->f_pos = offset; - nfs_file_open_context(filp)->dir_cookie = 0; + dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; } out: mutex_unlock(&inode->i_mutex); @@ -1068,7 +1135,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) if (fhandle == NULL || fattr == NULL) goto out_error; - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error) goto out_bad; if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1224,7 +1291,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error == -ENOENT) goto no_entry; if (error < 0) { @@ -1562,7 +1629,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, if (dentry->d_inode) goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error) goto out_error; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d85a534b15c..3ac5bd695e5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -326,6 +326,9 @@ nfs_file_fsync(struct file *file, int datasync) ret = xchg(&ctx->error, 0); if (!ret && status < 0) ret = status; + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); return ret; } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 1084792bc0f..dcb61548887 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -222,6 +222,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh, goto out; } + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + inode = nfs_fhget(sb, mntfh, fattr); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 01768e5e2c9..57bb31ad7a5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -254,7 +254,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) + nfs_attr_check_mountpoint(sb, fattr); + + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; @@ -298,8 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if ((fattr->valid & NFS_ATTR_FATTR_FSID) - && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -639,7 +641,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; - ctx->dir_cookie = 0; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); @@ -1471,6 +1472,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; + atomic_set(&nfsi->commits_outstanding, 0); #endif } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 72e0bddf7a2..ce118ce885d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -39,6 +39,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; @@ -214,6 +220,7 @@ extern const u32 nfs41_maxwrite_overhead; /* nfs4proc.c */ #ifdef CONFIG_NFS_V4 extern struct rpc_procinfo nfs4_procedures[]; +void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *); #endif extern int nfs4_init_ds_session(struct nfs_client *clp); @@ -276,11 +283,25 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern void nfs_commit_free(struct nfs_write_data *p); extern int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops, int how); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg); +void nfs_commit_clear_lock(struct nfs_inode *nfsi); +void nfs_commitdata_release(void *data); +void nfs_commit_release_pages(struct nfs_write_data *data); + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *); @@ -296,12 +317,14 @@ extern int nfs4_init_client(struct nfs_client *clp, 
rpc_authflavor_t authflavour, int noresvport); extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); -extern int _nfs4_call_sync(struct nfs_server *server, +extern int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply); -extern int _nfs4_call_sync_session(struct nfs_server *server, +extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index bf1c68009ff..9166fcb66da 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -15,6 +15,7 @@ #include <linux/string.h> #include <linux/sunrpc/clnt.h> #include <linux/vfs.h> +#include <linux/sunrpc/gss_api.h> #include "internal.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -27,7 +28,8 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr); + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor); /* * nfs_path - reconstruct the path given an arbitrary dentry @@ -116,6 +118,102 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +#ifdef CONFIG_NFS_V4 +static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode) +{ + struct gss_api_mech *mech; + struct xdr_netobj oid; + int i; + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + + for (i = 0; i < flavors->num_flavors; i++) { + struct nfs4_secinfo_flavor *flavor; + flavor = &flavors->flavors[i]; + + if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { + pseudoflavor = flavor->flavor; + break; + } else if (flavor->flavor == RPC_AUTH_GSS) { + oid.len = flavor->gss.sec_oid4.len; + oid.data = flavor->gss.sec_oid4.data; + mech = gss_mech_get_by_OID(&oid); + if (!mech) + continue; + pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + gss_mech_put(mech); + break; + } + } + + return pseudoflavor; +} + +static rpc_authflavor_t nfs_negotiate_security(const struct dentry *parent, const struct dentry *dentry) +{ + int status = 0; + struct page *page; + struct nfs4_secinfo_flavors *flavors; + int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); + rpc_authflavor_t flavor = RPC_AUTH_UNIX; + + secinfo = NFS_PROTO(parent->d_inode)->secinfo; + if (secinfo != NULL) { + page = alloc_page(GFP_KERNEL); + if (!page) { + status = -ENOMEM; + goto out; + } + flavors = page_address(page); + status = secinfo(parent->d_inode, &dentry->d_name, flavors); + flavor = nfs_find_best_sec(flavors, dentry->d_inode); + put_page(page); + } + + return flavor; + +out: + status = -ENOMEM; + return status; +} + +static rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent, + struct dentry *dentry, struct path *path, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + rpc_authflavor_t flavor; + struct rpc_clnt *clone; + struct rpc_auth *auth; + int err; + + flavor = nfs_negotiate_security(parent, path->dentry); + if (flavor < 0) + goto out; + clone = rpc_clone_client(server->client); + auth = rpcauth_create(flavor, clone); + if (!auth) { + flavor = -EIO; + goto out_shutdown; + } + err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode, + &path->dentry->d_name, + fh, fattr); + if (err < 0) + flavor = err; +out_shutdown: + rpc_shutdown_client(clone); +out: + return flavor; +} +#else /* 
CONFIG_NFS_V4 */ +static inline rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, + struct dentry *parent, struct dentry *dentry, + struct path *path, struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + return -EPERM; +} +#endif /* CONFIG_NFS_V4 */ + /* * nfs_d_automount - Handle crossing a mountpoint on the server * @path - The mountpoint @@ -136,6 +234,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; int err; + rpc_authflavor_t flavor = 1; dprintk("--> nfs_d_automount()\n"); @@ -153,9 +252,16 @@ struct vfsmount *nfs_d_automount(struct path *path) /* Look it up again to get its attributes */ parent = dget_parent(path->dentry); - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, + err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, &path->dentry->d_name, fh, fattr); + if (err == -EPERM) { + flavor = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr); + if (flavor < 0) + err = flavor; + else + err = 0; + } dput(parent); if (err != 0) { mnt = ERR_PTR(err); @@ -165,7 +271,7 @@ struct vfsmount *nfs_d_automount(struct path *path) if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) mnt = nfs_do_refmount(path->dentry); else - mnt = nfs_do_submount(path->dentry, fh, fattr); + mnt = nfs_do_submount(path->dentry, fh, fattr, flavor); if (IS_ERR(mnt)) goto out; @@ -232,17 +338,20 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, * @dentry - parent directory * @fh - filehandle for new root dentry * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount * */ static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor) { struct nfs_clone_mount mountdata = { .sb = dentry->d_sb, .dentry = dentry, .fh = fh, .fattr = fattr, + .authflavor = authflavor, }; struct vfsmount *mnt = ERR_PTR(-ENOMEM); char *page = (char *) __get_free_page(GFP_USER); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0c80d8b3f9..38053d823eb 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -141,7 +141,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct inode *dir, struct qstr *name, +nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs3_diropargs arg = { diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c64be1cff08..e1c261ddd65 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -57,7 +57,8 @@ enum nfs4_session_state { struct nfs4_minor_version_ops { u32 minor_version; - int (*call_sync)(struct nfs_server *server, + int (*call_sync)(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -262,6 +263,8 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + bool sync); static inline bool is_ds_only_client(struct nfs_client *clp) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 42855846481..6f8192f4cfc 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -154,6 +154,23 @@ static int filelayout_read_done_cb(struct rpc_task *task, 
} /* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. + */ +static void +filelayout_set_layoutcommit(struct nfs_write_data *wdata) +{ + if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds || + wdata->res.verf->committed == NFS_FILE_SYNC) + return; + + pnfs_set_layoutcommit(wdata); + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, + (unsigned long) wdata->lseg->pls_end_pos); +} + +/* * Call ops for the async read/write cases * In the case of dense layouts, the offset needs to be reset to its * original value. @@ -210,6 +227,38 @@ static int filelayout_write_done_cb(struct rpc_task *task, return -EAGAIN; } + filelayout_set_layoutcommit(data); + return 0; +} + +/* Fake up some data that will cause nfs_commit_release to retry the writes. */ +static void prepare_to_resend_writes(struct nfs_write_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(data->verf.verifier, first->wb_verf.verifier, + sizeof(first->wb_verf.verifier)); + data->verf.verifier[0]++; /* ensure verifier mismatch */ +} + +static int filelayout_commit_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + prepare_to_resend_writes(data); + filelayout_set_lo_fail(data->lseg); + } else + nfs_restart_rpc(task, data->ds_clp); + return -EAGAIN; + } + return 0; } @@ -240,6 +289,16 @@ static void filelayout_write_release(void *data) wdata->mds_ops->rpc_release(data); } +static void filelayout_commit_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + nfs_commit_release_pages(wdata); + if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) + nfs_commit_clear_lock(NFS_I(wdata->inode)); + nfs_commitdata_release(wdata); +} + struct rpc_call_ops filelayout_read_call_ops = { .rpc_call_prepare = filelayout_read_prepare, .rpc_call_done = filelayout_read_call_done, @@ -252,6 +311,12 @@ struct rpc_call_ops filelayout_write_call_ops = { .rpc_release = filelayout_write_release, }; +struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_commit_release, +}; + static enum pnfs_try_status filelayout_read_pagelist(struct nfs_read_data *data) { @@ -320,10 +385,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->inode->i_ino, sync, (size_t) data->args.count, offset, ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); - /* We can't handle commit to ds yet */ - if (!FILELAYOUT_LSEG(lseg)->commit_through_mds) - data->args.stable = NFS_FILE_SYNC; - data->write_done_cb = filelayout_write_done_cb; data->ds_clp = ds->ds_clp; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -441,12 +502,33 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, struct nfs4_layoutget_res *lgr, struct nfs4_deviceid *id) { - uint32_t *p = (uint32_t *)lgr->layout.buf; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + __be32 *p; uint32_t 
nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + return -ENOMEM; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* 20 = nfl_util (4), first_stripe_index (4), pattern_offset (8), + * num_fh (4) */ + p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); + if (unlikely(!p)) + goto out_err; + memcpy(id, p, sizeof(*id)); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); print_deviceid(id); @@ -468,32 +550,57 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __func__, nfl_util, fl->num_fh, fl->first_stripe_index, fl->pattern_offset); + if (!fl->num_fh) + goto out_err; + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), GFP_KERNEL); if (!fl->fh_array) - return -ENOMEM; + goto out_err; for (i = 0; i < fl->num_fh; i++) { /* Do we want to use a mempool here? */ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); - if (!fl->fh_array[i]) { - filelayout_free_fh_array(fl); - return -ENOMEM; - } + if (!fl->fh_array[i]) + goto out_err_free; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free; fl->fh_array[i]->size = be32_to_cpup(p++); if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { printk(KERN_ERR "Too big fh %d received %d\n", i, fl->fh_array[i]->size); - filelayout_free_fh_array(fl); - return -EIO; + goto out_err_free; } + + p = xdr_inline_decode(&stream, fl->fh_array[i]->size); + if (unlikely(!p)) + goto out_err_free; memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); - p += XDR_QUADLEN(fl->fh_array[i]->size); dprintk("DEBUG: %s: fh len %d\n", __func__, fl->fh_array[i]->size); } + __free_page(scratch); return 0; + +out_err_free: + filelayout_free_fh_array(fl); +out_err: + __free_page(scratch); + return -EIO; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + nfs4_fl_put_deviceid(fl->dsaddr); + kfree(fl->commit_buckets); + _filelayout_free_lseg(fl); } static struct pnfs_layout_segment * @@ -514,17 +621,28 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, _filelayout_free_lseg(fl); return NULL; } - return &fl->generic_hdr; -} -static void -filelayout_free_lseg(struct pnfs_layout_segment *lseg) -{ - struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); - - dprintk("--> %s\n", __func__); - nfs4_fl_put_deviceid(fl->dsaddr); - _filelayout_free_lseg(fl); + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of <multipath_list4, fh> keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) { + int i; + int size = (fl->stripe_type == STRIPE_SPARSE) ? 
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + fl->commit_buckets = kcalloc(size, sizeof(struct list_head), GFP_KERNEL); + if (!fl->commit_buckets) { + filelayout_free_lseg(&fl->generic_hdr); + return NULL; + } + fl->number_of_buckets = size; + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&fl->commit_buckets[i]); + } + return &fl->generic_hdr; } /* @@ -552,6 +670,191 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } +static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) +{ + return !FILELAYOUT_LSEG(lseg)->commit_through_mds; +} + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ + if (fl->stripe_type == STRIPE_SPARSE) + return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); + else + return j; +} + +struct list_head *filelayout_choose_commit_list(struct nfs_page *req) +{ + struct pnfs_layout_segment *lseg = req->wb_commit_lseg; + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + u32 i, j; + struct list_head *list; + + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. + */ + j = nfs4_fl_calc_j_index(lseg, + (loff_t)req->wb_index << PAGE_CACHE_SHIFT); + i = select_bucket_index(fl, j); + list = &fl->commit_buckets[i]; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg */ + get_lseg(lseg); + } + return list; +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) + return i; + else + return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + } + return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_write_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + u32 idx; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + prepare_to_resend_writes(data); + data->mds_ops->rpc_release(data); + return -EAGAIN; + } + dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); + data->write_done_cb = filelayout_commit_done_cb; + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient, + &filelayout_commit_call_ops, how); +} + +/* + * This is only useful while we are using whole file layouts. 
+ */ +static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = get_lseg(lseg); + spin_unlock(&inode->i_lock); + return rv; +} + +static int alloc_ds_commits(struct inode *inode, struct list_head *list) +{ + struct pnfs_layout_segment *lseg; + struct nfs4_filelayout_segment *fl; + struct nfs_write_data *data; + int i, j; + + /* Won't need this when non-whole file layout segments are supported + * instead we will use a pnfs_layout_hdr structure */ + lseg = find_only_write_lseg(inode); + if (!lseg) + return 0; + fl = FILELAYOUT_LSEG(lseg); + for (i = 0; i < fl->number_of_buckets; i++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->ds_commit_index = i; + data->lseg = lseg; + list_add(&data->pages, list); + } + put_lseg(lseg); + return 0; + +out_bad: + for (j = i; j < fl->number_of_buckets; j++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + nfs_retry_commit(&fl->commit_buckets[i], lseg); + put_lseg(lseg); /* associated with emptying bucket */ + } + put_lseg(lseg); + /* Caller will clean up entries put on list */ + return -ENOMEM; +} + +/* This follows nfs_commit_list pretty closely */ +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how) +{ + struct nfs_write_data *data, *tmp; + LIST_HEAD(list); + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->lseg = NULL; + list_add(&data->pages, &list); + } + + if (alloc_ds_commits(inode, &list)) + goto out_bad; + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + atomic_inc(&NFS_I(inode)->commits_outstanding); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL); + nfs_initiate_commit(data, NFS_CLIENT(inode), + data->mds_ops, how); + } else { + nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg); + filelayout_initiate_commit(data, how); + } + } + return 0; + out_bad: + list_for_each_entry_safe(data, tmp, &list, pages) { + nfs_retry_commit(&data->pages, data->lseg); + list_del_init(&data->pages); + nfs_commit_free(data); + } + nfs_retry_commit(mds_pages, NULL); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -559,6 +862,9 @@ static struct pnfs_layoutdriver_type filelayout_type = { .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, .pg_test = filelayout_pg_test, + .mark_pnfs_commit = filelayout_mark_pnfs_commit, + .choose_commit_list = filelayout_choose_commit_list, + .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, }; diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index ee0c907742b..085a354e0f0 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -79,6 +79,8 @@ struct nfs4_filelayout_segment { struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ unsigned int num_fh; struct nfs_fh **fh_array; + struct list_head *commit_buckets; /* Sort commits to ds */ + int number_of_buckets; }; static inline struct nfs4_filelayout_segment * diff --git a/fs/nfs/nfs4filelayoutdev.c 
b/fs/nfs/nfs4filelayoutdev.c index 68143c162e3..de5350f2b24 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -261,7 +261,7 @@ out: * Currently only support ipv4, and one multi-path address. */ static struct nfs4_pnfs_ds * -decode_and_add_ds(__be32 **pp, struct inode *inode) +decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode) { struct nfs4_pnfs_ds *ds = NULL; char *buf; @@ -269,25 +269,34 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) u32 ip_addr, port; int nlen, rlen, i; int tmp[2]; - __be32 *r_netid, *r_addr, *p = *pp; + __be32 *p; /* r_netid */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; nlen = be32_to_cpup(p++); - r_netid = p; - p += XDR_QUADLEN(nlen); - /* r_addr */ - rlen = be32_to_cpup(p++); - r_addr = p; - p += XDR_QUADLEN(rlen); - *pp = p; + p = xdr_inline_decode(streamp, nlen); + if (unlikely(!p)) + goto out_err; /* Check that netid is "tcp" */ - if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { + if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); goto out_err; } + /* r_addr */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(streamp, rlen); + if (unlikely(!p)) + goto out_err; + /* ipv6 length plus port is legal */ if (rlen > INET6_ADDRSTRLEN + 8) { dprintk("%s: Invalid address, length %d\n", __func__, @@ -300,7 +309,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) goto out_err; } buf[rlen] = '\0'; - memcpy(buf, r_addr, rlen); + memcpy(buf, p, rlen); /* replace the port dots with dashes for the in4_pton() delimiter*/ for (i = 0; i < 2; i++) { @@ -336,90 +345,154 @@ out_err: static struct nfs4_file_layout_dsaddr* decode_device(struct inode *ino, struct pnfs_device *pdev) { - int i, dummy; + int i; u32 cnt, num; u8 *indexp; - __be32 *p = (__be32 *)pdev->area, *indicesp; - struct nfs4_file_layout_dsaddr *dsaddr; + __be32 *p; + u8 *stripe_indices; + u8 max_stripe_index; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = pdev->pages, + .page_len = pdev->pglen, + .buflen = pdev->pglen, + .len = pdev->pglen, + }; + struct page *scratch; + + /* set up xdr stream */ + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + goto out_err; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); /* Get the stripe count (number of stripe index) */ - cnt = be32_to_cpup(p++); + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_scratch; + + cnt = be32_to_cpup(p); dprintk("%s stripe count %d\n", __func__, cnt); if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { printk(KERN_WARNING "%s: stripe count %d greater than " "supported maximum %d\n", __func__, cnt, NFS4_PNFS_MAX_STRIPE_CNT); - goto out_err; + goto out_err_free_scratch; + } + + /* read stripe indices */ + stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL); + if (!stripe_indices) + goto out_err_free_scratch; + + p = xdr_inline_decode(&stream, cnt << 2); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + indexp = &stripe_indices[0]; + max_stripe_index = 0; + for (i = 0; i < cnt; i++) { + *indexp = be32_to_cpup(p++); + max_stripe_index = max(max_stripe_index, *indexp); + indexp++; } /* Check the multipath list count */ - indicesp = p; - p += XDR_QUADLEN(cnt << 2); - num = be32_to_cpup(p++); + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + num = 
be32_to_cpup(p); dprintk("%s ds_num %u\n", __func__, num); if (num > NFS4_PNFS_MAX_MULTI_CNT) { printk(KERN_WARNING "%s: multipath count %d greater than " "supported maximum %d\n", __func__, num, NFS4_PNFS_MAX_MULTI_CNT); - goto out_err; + goto out_err_free_stripe_indices; } + + /* validate stripe indices are all < num */ + if (max_stripe_index >= num) { + printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n", + __func__, max_stripe_index, num); + goto out_err_free_stripe_indices; + } + dsaddr = kzalloc(sizeof(*dsaddr) + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), GFP_KERNEL); if (!dsaddr) - goto out_err; - - dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); - if (!dsaddr->stripe_indices) - goto out_err_free; + goto out_err_free_stripe_indices; dsaddr->stripe_count = cnt; + dsaddr->stripe_indices = stripe_indices; + stripe_indices = NULL; dsaddr->ds_num = num; memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); - /* Go back an read stripe indices */ - p = indicesp; - indexp = &dsaddr->stripe_indices[0]; - for (i = 0; i < dsaddr->stripe_count; i++) { - *indexp = be32_to_cpup(p++); - if (*indexp >= num) - goto out_err_free; - indexp++; - } - /* Skip already read multipath list count */ - p++; - for (i = 0; i < dsaddr->ds_num; i++) { int j; + u32 mp_count; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; - dummy = be32_to_cpup(p++); /* multipath count */ - if (dummy > 1) { + mp_count = be32_to_cpup(p); /* multipath count */ + if (mp_count > 1) { printk(KERN_WARNING "%s: Multipath count %d not supported, " "skipping all greater than 1\n", __func__, - dummy); + mp_count); } - for (j = 0; j < dummy; j++) { + for (j = 0; j < mp_count; j++) { if (j == 0) { - dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); + dsaddr->ds_list[i] = decode_and_add_ds(&stream, + ino); if (dsaddr->ds_list[i] == NULL) - goto out_err_free; + goto out_err_free_deviceid; } else { u32 len; /* skip extra multipath */ - len = be32_to_cpup(p++); - p += XDR_QUADLEN(len); - len = be32_to_cpup(p++); - p += XDR_QUADLEN(len); - continue; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; } } } + + __free_page(scratch); return dsaddr; -out_err_free: +out_err_free_deviceid: nfs4_fl_free_deviceid(dsaddr); + /* stripe_indicies was part of dsaddr */ + goto out_err_free_scratch; +out_err_free_stripe_indices: + kfree(stripe_indices); +out_err_free_scratch: + __free_page(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; @@ -498,11 +571,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) goto out_free; } - /* set pdev->area */ - pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); - if (!pdev->area) - goto out_free; - memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); pdev->layout_type = LAYOUT_NFSV4_1_FILES; pdev->pages = pages; @@ -521,8 +589,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) */ dsaddr = decode_and_add_device(inode, pdev); out_free: - if (pdev->area != NULL) - vunmap(pdev->area); for (i = 0; i < max_pages; i++) __free_page(pages[i]); kfree(pages); diff --git a/fs/nfs/nfs4proc.c 
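With decode_device() converted to an xdr_stream, the vmap()/vunmap() of the GETDEVICEINFO reply pages (removed from get_device_info() below) becomes unnecessary. The scratch page is what makes that safe; a condensed sketch of the pattern, using only calls visible above:

	struct xdr_stream stream;
	struct page *scratch = alloc_page(GFP_KERNEL);

	xdr_init_decode(&stream, &buf, NULL);	/* buf wraps pdev->pages */
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
	/* xdr_inline_decode() now copies any item that straddles a page
	 * boundary into the scratch page and returns a pointer into it */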
b/fs/nfs/nfs4proc.c index 1d84e7088af..dfd1e6d7e6c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -41,6 +41,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/gss_api.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> @@ -71,7 +72,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -85,6 +88,8 @@ static int nfs4_map_errors(int err) switch (err) { case -NFS4ERR_RESOURCE: return -EREMOTEIO; + case -NFS4ERR_WRONGSEC: + return -EPERM; case -NFS4ERR_BADOWNER: case -NFS4ERR_BADNAME: return -EINVAL; @@ -657,7 +662,8 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static int nfs4_call_sync_sequence(struct nfs_server *server, +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -673,7 +679,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, .cache_reply = cache_reply, }; struct rpc_task_setup task_setup = { - .rpc_client = server->client, + .rpc_client = clnt, .rpc_message = msg, .callback_ops = &nfs41_call_sync_ops, .callback_data = &data @@ -692,13 +698,14 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, return ret; } -int _nfs4_call_sync_session(struct nfs_server *server, +int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { - return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0); } #else @@ -709,19 +716,28 @@ static int nfs4_sequence_done(struct rpc_task *task, } #endif /* CONFIG_NFS_V4_1 */ -int _nfs4_call_sync(struct nfs_server *server, +int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { args->sa_session = res->sr_session = NULL; - return rpc_call_sync(server->client, msg, 0); + return rpc_call_sync(clnt, msg, 0); } -#define nfs4_call_sync(server, msg, args, res, cache_reply) \ - (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ - &(res)->seq_res, (cache_reply)) +static inline +int nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, + args, res, cache_reply); +} static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { @@ -1831,7 +1847,7 @@ 
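Threading an explicit rpc_clnt through nfs4_call_sync() is what allows a compound to run over a client carrying a non-default security flavor. A hedged illustration: lookup_with_alt_flavor() below is hypothetical, not part of the patch; it assumes rpc_clone_client() plus the new _nfs4_proc_lookup() signature, and would sit alongside the static helpers in nfs4proc.c:

	static int lookup_with_alt_flavor(struct nfs_server *server,
					  struct inode *dir,
					  const struct qstr *name,
					  struct nfs_fh *fh,
					  struct nfs_fattr *fattr,
					  rpc_authflavor_t flavor)
	{
		struct rpc_clnt *clone;
		struct rpc_auth *auth;
		int status;

		clone = rpc_clone_client(server->client); /* private copy */
		if (IS_ERR(clone))
			return PTR_ERR(clone);
		auth = rpcauth_create(flavor, clone);	/* swap flavor */
		if (!auth) {
			status = -EIO;
			goto out;
		}
		status = _nfs4_proc_lookup(clone, dir, name, fh, fattr);
	out:
		rpc_shutdown_client(clone);
		return status;
	}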
static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); return status; @@ -2090,7 +2106,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f }; int status; - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| @@ -2160,7 +2176,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(info->fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -2176,15 +2192,43 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ + struct rpc_auth *auth; + int ret; + + auth = rpcauth_create(flavor, server->client); + if (!auth) { + ret = -EIO; + goto out; + } + ret = nfs4_lookup_root(server, fhandle, info); + if (ret < 0) + ret = -EAGAIN; +out: + return ret; +} + /* * get the file handle for the "/" directory on the server */ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int status; + int i, len, status = 0; + rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2]; + + flav_array[0] = RPC_AUTH_UNIX; + len = gss_mech_list_pseudoflavors(&flav_array[1]); + flav_array[1+len] = RPC_AUTH_NULL; + len += 2; - status = nfs4_lookup_root(server, fhandle, info); + for (i = 0; i < len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); + if (status == 0) + break; + } if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) @@ -2249,7 +2293,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) @@ -2309,9 +2353,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, + const struct nfs_fh *dirfh, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct nfs4_lookup_arg args = { @@ -2333,7 +2377,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d nfs_fattr_init(fattr); dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookupfh: %d\n", status); return status; 
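For reference, the probe order built by nfs4_proc_get_root() above:

	flav_array[0]         RPC_AUTH_UNIX      tried first
	flav_array[1..len]    GSS pseudoflavors  from gss_mech_list_pseudoflavors()
	flav_array[1 + len]   RPC_AUTH_NULL      last resort

The array is sized NFS_MAX_SECFLAVORS + 2 to leave room for the AUTH_UNIX and AUTH_NULL bookends around at most NFS_MAX_SECFLAVORS GSS flavors.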
} @@ -2345,7 +2389,7 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, struct nfs4_exception exception = { }; int err; do { - err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); + err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); /* FIXME: !!!! */ if (err == -NFS4ERR_MOVED) { err = -EREMOTE; @@ -2356,27 +2400,41 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, return err; } -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { int status; dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); + status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); if (status == -NFS4ERR_MOVED) status = nfs4_get_referral(dir, name, fattr, fhandle); dprintk("NFS reply lookup: %d\n", status); return status; } -static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(struct nfs_fh)); + fattr->fsid.major = 1; + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(dir, name, fhandle, fattr), + _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), &exception); + if (err == -EPERM) + nfs_fixup_secinfo_attributes(fattr, fhandle); } while (exception.retry); return err; } @@ -2421,7 +2479,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry if (res.fattr == NULL) return -ENOMEM; - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { entry->mask = 0; if (res.access & NFS4_ACCESS_READ) @@ -2488,7 +2546,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, .rpc_resp = &res, }; - return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); + return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_readlink(struct inode *inode, struct page *page, @@ -2577,7 +2635,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) if (res.dir_attr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &args, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); @@ -2678,7 +2736,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, if (res.old_fattr == NULL || res.new_fattr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); 
nfs_post_op_update_inode(old_dir, res.old_fattr); @@ -2729,7 +2787,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * if (res.fattr == NULL || res.dir_attr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); @@ -2792,8 +2850,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) { - int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, - &data->arg, &data->res, 1); + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { update_changeattr(dir, &data->res.dir_cinfo); nfs_post_op_update_inode(dir, data->res.dir_fattr); @@ -2905,7 +2963,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, (unsigned long long)cookie); nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; - status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; @@ -2997,7 +3055,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fsstat->fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) @@ -3028,7 +3086,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) @@ -3073,7 +3131,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle } nfs_fattr_init(pathconf->fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -3195,12 +3253,9 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; } -static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); @@ -3210,11 +3265,24 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } +static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb(task, data); +} + static void nfs4_proc_commit_setup(struct nfs_write_data *data, 
struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); - - data->args.bitmask = server->cache_consistency_bitmask; + + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } @@ -3452,7 +3520,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu resp_buf = buf; buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); } - ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); if (ret) goto out_free; if (res.acl_len > args.acl_len) @@ -3527,7 +3595,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl if (i < 0) return i; nfs_inode_return_delegation(inode); - ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* * Free each page after tx, so the only ref left is @@ -3890,7 +3958,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock lsp = request->fl_u.nfs4_fl.owner; arg.lock_owner.id = lsp->ls_id.id; arg.lock_owner.s_dev = server->s_dev; - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: request->fl_type = F_UNLCK; @@ -4618,12 +4686,46 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + int status; + struct nfs4_secinfo_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + dprintk("NFS call secinfo %s\n", name->name); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply secinfo: %d\n", status); + return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_secinfo(dir, name, flavors), + &exception); + } while (exception.retry); + return err; +} + #ifdef CONFIG_NFS_V4_1 /* * Check the exchange flags returned by the server for invalid flags, having @@ -5516,8 +5618,6 @@ static void nfs4_layoutget_release(void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - if (lgp->res.layout.buf != NULL) - free_page((unsigned long) lgp->res.layout.buf); put_nfs_open_context(lgp->args.ctx); kfree(calldata); dprintk("<-- %s\n", __func__); @@ -5549,12 +5649,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) dprintk("--> %s\n", __func__); - 
lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); - if (lgp->res.layout.buf == NULL) { - nfs4_layoutget_release(lgp); - return -ENOMEM; - } - + lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -5586,7 +5681,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) int status; dprintk("--> %s\n", __func__); - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -5606,6 +5701,100 @@ int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) } EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { /* Just ignore these failures */ + case NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case NFS4ERR_BADLAYOUT: /* no layout */ + case NFS4ERR_GRACE: /* loca_recalim always false */ + task->tk_status = 0; + } + + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, server->nfs_client); + return; + } + + if (task->tk_status == 0) + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); +} + +static void nfs4_layoutcommit_release(void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + + /* Matched by references in pnfs_set_layoutcommit */ + put_lseg(data->lseg); + put_rpccred(data->cred); + kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. 
sync %d " + "lbw: %llu inode %lu\n", + data->task.tk_pid, sync, + data->args.lastbytewritten, + data->args.inode->i_ino); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (sync == false) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = task->tk_status; +out: + dprintk("%s: status %d\n", __func__, status); + rpc_put_task(task); + return status; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -5741,6 +5930,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .init_client = nfs4_init_client, + .secinfo = nfs4_proc_secinfo, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ab1bf5bb021..a6804f704d9 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -590,7 +590,8 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state->owner = owner; atomic_inc(&owner->so_count); list_add(&state->inode_states, &nfsi->open_states); - state->inode = igrab(inode); + ihold(inode); + state->inode = inode; spin_unlock(&inode->i_lock); /* Note: The reclaim code dictates that we add stateless * and read-only stateids to the end of the list */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0cf560f7788..dddfb5795d7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -46,6 +46,7 @@ #include <linux/kdev_t.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/gss_api.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> @@ -112,7 +113,7 @@ static int nfs4_stat_to_errno(int); #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (encode_getattr_maxsz) -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -253,6 +254,8 @@ static int nfs4_stat_to_errno(int); (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ (0) +#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN))) #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) @@ -324,6 +327,18 @@ static int nfs4_stat_to_errno(int); #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + 1 /* reclaim */ + \ + encode_stateid_maxsz + \ + 1 /* new offset (true) */ + \ + 2 /* last byte written */ + \ + 1 /* nt_timechanged (false) */ + \ + 1 /* layoutupdate4 layout type */ + \ + 1 /* NULL filelayout layoutupdate4 payload */) +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) + #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -676,6 +691,14 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ decode_lookup_maxsz + \ decode_fs_locations_maxsz) +#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \ + 
decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_secinfo_maxsz) #if defined(CONFIG_NFS_V4_1) #define NFS4_enc_exchange_id_sz \ (compound_encode_hdr_maxsz + \ @@ -727,6 +750,17 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1620,6 +1654,18 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state hdr->replen += decode_delegreturn_maxsz; } +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + int len = name->len; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_SECINFO); + xdr_encode_opaque(p, name->name, len); + hdr->nops++; + hdr->replen += decode_secinfo_maxsz; +} + #if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ static void encode_exchange_id(struct xdr_stream *xdr, @@ -1816,6 +1862,34 @@ encode_layoutget(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_layoutget_maxsz; } + +static int +encode_layoutcommit(struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, + NFS_SERVER(args->inode)->pnfs_curr_ld->id); + + p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); + /* Only whole file layouts */ + p = xdr_encode_hyper(p, 0); /* offset */ + p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ + *p++ = cpu_to_be32(0); /* reclaim */ + p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ + *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + *p++ = cpu_to_be32(0); /* no file layout payload */ + + hdr->nops++; + hdr->replen += decode_layoutcommit_maxsz; + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2294,7 +2368,8 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_commit(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2465,6 +2540,24 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_secinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_secinfo(xdr, args->name, &hdr); + encode_nops(&hdr); +} + #if defined(CONFIG_NFS_V4_1) /* * EXCHANGE_ID request @@ -2604,8 +2697,32 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, NFS_FH(args->inode), &hdr); 
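A size check on the reserve_space() call in encode_layoutcommit() above: 4 (opcode) + 8 (offset) + 8 (length) + 4 (reclaim) + 4 (newoffset) + 8 (last byte written) + 4 (time_modify_changed) + 4 (layout type) + 4 (zero-length payload) = 48 bytes, plus NFS4_STATEID_SIZE for the stateid, matching encode_layoutcommit_maxsz word for word.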
encode_layoutget(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->layout.pages, 0, args->layout.pglen); + encode_nops(&hdr); } + +/* + * Encode LAYOUTCOMMIT request + */ +static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutcommit(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2925,6 +3042,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) if (unlikely(!p)) goto out_overflow; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + return -be32_to_cpup(p); } return 0; out_overflow: @@ -3912,6 +4030,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, fattr->valid |= status; status = decode_attr_error(xdr, bitmap); + if (status == -NFS4ERR_WRONGSEC) { + nfs_fixup_secinfo_attributes(fattr, fh); + status = 0; + } if (status < 0) goto xdr_error; @@ -4680,6 +4802,73 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } +static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.sec_oid4.len = be32_to_cpup(p); + if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + goto out_err; + + p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + if (unlikely(!p)) + goto out_overflow; + memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.qop4 = be32_to_cpup(p++); + flavor->gss.service = be32_to_cpup(p); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_err: + return -EINVAL; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + struct nfs4_secinfo_flavor *sec_flavor; + int status; + __be32 *p; + int i; + + status = decode_op_hdr(xdr, OP_SECINFO); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->flavors->num_flavors = be32_to_cpup(p); + + for (i = 0; i < res->flavors->num_flavors; i++) { + sec_flavor = &res->flavors->flavors[i]; + if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE) + break; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sec_flavor->flavor = be32_to_cpup(p); + + if (sec_flavor->flavor == RPC_AUTH_GSS) { + if (decode_secinfo_gss(xdr, sec_flavor)) + break; + } + } + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + #if defined(CONFIG_NFS_V4_1) static int decode_exchange_id(struct xdr_stream *xdr, struct nfs41_exchange_id_res *res) @@ -4950,6 +5139,9 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, __be32 *p; int status; u32 layout_count; + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + u32 hdrlen, recvd; status = decode_op_hdr(xdr, OP_LAYOUTGET); if (status) @@ -4966,17 +5158,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, return -EINVAL; } - p = xdr_inline_decode(xdr, 24); + p = xdr_inline_decode(xdr, 28); 
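+	/* 28 bytes: 8 (offset) + 8 (length) + 4 (iomode) + 4 (layout type)
+	 *         + 4 (opaque layout body length), the fixed LAYOUTGET head;
+	 *         4 more than before since the body length is now read
+	 *         inline instead of via decode_opaque_inline() */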
if (unlikely(!p)) goto out_overflow; p = xdr_decode_hyper(p, &res->range.offset); p = xdr_decode_hyper(p, &res->range.length); res->range.iomode = be32_to_cpup(p++); res->type = be32_to_cpup(p++); - - status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); - if (unlikely(status)) - return status; + res->layoutp->len = be32_to_cpup(p); dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", __func__, @@ -4984,12 +5173,18 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, (unsigned long)res->range.length, res->range.iomode, res->type, - res->layout.len); + res->layoutp->len); - /* nfs4_proc_layoutget allocated a single page */ - if (res->layout.len > PAGE_SIZE) - return -ENOMEM; - memcpy(res->layout.buf, p, res->layout.len); + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (res->layoutp->len > recvd) { + dprintk("NFS: server cheating in layoutget reply: " + "layout len %u > recvd %u\n", + res->layoutp->len, recvd); + return -EINVAL; + } + + xdr_read_pages(xdr, res->layoutp->len); if (layout_count > 1) { /* We only handle a length one array at the moment. Any @@ -5006,6 +5201,35 @@ out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } + +static int decode_layoutcommit(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct nfs4_layoutcommit_res *res) +{ + __be32 *p; + __u32 sizechanged; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sizechanged = be32_to_cpup(p); + + if (sizechanged) { + /* throw away new size */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -5723,8 +5947,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_commit(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5919,6 +6144,32 @@ out: return status; } +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); + if (status) + goto out; +out: + return status; +} + #if defined(CONFIG_NFS_V4_1) /* * Decode EXCHANGE_ID response @@ -6066,6 +6317,34 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutcommit(xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 
*/ /** @@ -6180,10 +6459,6 @@ static struct { { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, - { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs - * to be handled by a - * middle-layer. - */ { -1, -EIO } }; @@ -6258,6 +6533,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(SECINFO, enc_secinfo, dec_secinfo), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), @@ -6267,6 +6543,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 23e79441066..c80add6e221 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -135,14 +135,14 @@ void nfs_clear_page_tag_locked(struct nfs_page *req) nfs_unlock_request(req); } -/** +/* * nfs_clear_request - Free up all resources allocated to the request * @req: * * Release page and open context resources associated with a read/write * request after it has completed. */ -void nfs_clear_request(struct nfs_page *req) +static void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; struct nfs_open_context *ctx = req->wb_context; @@ -223,6 +223,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_count = 0; desc->pg_bsize = bsize; desc->pg_base = 0; + desc->pg_moreio = 0; desc->pg_inode = inode; desc->pg_doio = doio; desc->pg_ioflags = io_flags; @@ -335,9 +336,11 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { while (!nfs_pageio_do_add_request(desc, req)) { + desc->pg_moreio = 1; nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; + desc->pg_moreio = 0; } return 1; } @@ -395,6 +398,7 @@ int nfs_scan_list(struct nfs_inode *nfsi, pgoff_t idx_end; int found, i; int res; + struct list_head *list; res = 0; if (npages == 0) @@ -415,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi, idx_start = req->wb_index + 1; if (nfs_set_page_tag_locked(req)) { kref_get(&req->wb_kref); - nfs_list_remove_request(req); radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, tag); - nfs_list_add_request(req, dst); + list = pnfs_choose_commit_list(req, dst); + nfs_list_add_request(req, list); res++; if (res == INT_MAX) goto out; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f38813a0a29..d9ab97269ce 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,6 +259,7 @@ put_lseg(struct pnfs_layout_segment *lseg) pnfs_free_lseg_list(&free_me); } } +EXPORT_SYMBOL_GPL(put_lseg); static bool should_free_lseg(u32 lseg_iomode, u32 recall_iomode) @@ -471,6 +472,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg = NULL; + struct page **pages = NULL; + int i; + u32 max_resp_sz, max_pages; dprintk("--> %s\n", __func__); @@ -478,6 +482,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); if (lgp == NULL) return NULL; + + /* allocate pages for xdr post processing */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = 
max_resp_sz >> PAGE_SHIFT; + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_err_free; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_err_free; + } + lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range.iomode = iomode; @@ -486,6 +505,8 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); + lgp->args.layout.pages = pages; + lgp->args.layout.pglen = max_pages * PAGE_SIZE; lgp->lsegpp = &lseg; /* Synchronously retrieve layout information from server and @@ -496,7 +517,26 @@ send_layoutget(struct pnfs_layout_hdr *lo, /* remember that LAYOUTGET failed and suspend trying */ set_bit(lo_fail_bit(iomode), &lo->plh_flags); } + + /* free xdr pages */ + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + return lseg; + +out_err_free: + /* free any allocated xdr pages, lgp as it's not used */ + if (pages) { + for (i = 0; i < max_pages; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); + } + kfree(lgp); + return NULL; } bool pnfs_roc(struct inode *ino) @@ -945,3 +985,105 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); return trypnfs; } + +/* + * Currently there is only one (whole file) write lseg. + */ +static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = lseg; + return rv; +} + +void +pnfs_set_layoutcommit(struct nfs_write_data *wdata) +{ + struct nfs_inode *nfsi = NFS_I(wdata->inode); + loff_t end_pos = wdata->args.offset + wdata->res.count; + + spin_lock(&nfsi->vfs_inode.i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + /* references matched in nfs4_layoutcommit_release */ + get_lseg(wdata->lseg); + wdata->lseg->pls_lc_cred = + get_rpccred(wdata->args.context->state->owner->so_cred); + mark_inode_dirty_sync(wdata->inode); + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, wdata->inode->i_ino); + } + if (end_pos > wdata->lseg->pls_end_pos) + wdata->lseg->pls_end_pos = end_pos; + spin_unlock(&nfsi->vfs_inode.i_lock); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +/* + * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and + * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough + * data to disk to allow the server to recover the data if it crashes. + * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag + * is off, and a COMMIT is sent to a data server, or + * if WRITEs to a data server return NFS_DATA_SYNC. 
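+ * Concretely: with NFL4_UFLG_COMMIT_THRU_MDS clear, an NFS_UNSTABLE
+ * WRITE followed by a COMMIT to the data server leaves the MDS unaware
+ * of the new size and change attribute until a LAYOUTCOMMIT is sent.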
+ */ +int +pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg; + struct rpc_cred *cred; + loff_t end_pos; + int status = 0; + + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return 0; + + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + mark_inode_dirty_sync(inode); + status = -ENOMEM; + goto out; + } + + spin_lock(&inode->i_lock); + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + spin_unlock(&inode->i_lock); + kfree(data); + goto out; + } + /* + * Currently only one (whole file) write lseg which is referenced + * in pnfs_set_layoutcommit and will be found. + */ + lseg = pnfs_list_write_lseg(inode); + + end_pos = lseg->pls_end_pos; + cred = lseg->pls_lc_cred; + lseg->pls_end_pos = 0; + lseg->pls_lc_cred = NULL; + + memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, + sizeof(nfsi->layout->plh_stateid.data)); + spin_unlock(&inode->i_lock); + + data->args.inode = inode; + data->lseg = lseg; + data->cred = cred; + nfs_fattr_init(&data->fattr); + data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + data->res.fattr = &data->fattr; + data->args.lastbytewritten = end_pos - 1; + data->res.server = NFS_SERVER(inode); + + status = nfs4_proc_layoutcommit(data, sync); +out: + dprintk("<-- %s status %d\n", __func__, status); + return status; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 6380b9405bc..bc4827202e7 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -43,6 +43,8 @@ struct pnfs_layout_segment { atomic_t pls_refcount; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; + struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */ + loff_t pls_end_pos; /* LAYOUTCOMMIT write end */ }; enum pnfs_try_status { @@ -74,6 +76,13 @@ struct pnfs_layoutdriver_type { /* test for nfs page cache coalescing */ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + /* Returns true if layoutdriver wants to divert this request to + * driver's commit routine. 
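+ * When a request is diverted, it pins the lseg through wb_commit_lseg
+ * (reference taken in pnfs_mark_request_commit) until
+ * pnfs_choose_commit_list or pnfs_clear_request_commit drops it.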
+ */ + bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); + struct list_head * (*choose_commit_list) (struct nfs_page *req); + int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -100,7 +109,6 @@ struct pnfs_device { unsigned int layout_type; unsigned int mincount; struct page **pages; - void *area; unsigned int pgbase; unsigned int pglen; }; @@ -145,7 +153,8 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier); - +void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +int pnfs_layoutcommit_inode(struct inode *inode, bool sync); static inline int lo_fail_bit(u32 iomode) { @@ -169,6 +178,51 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss) return nfss->pnfs_curr_ld != NULL; } +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + if (lseg) { + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; + if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) { + set_bit(PG_PNFS_COMMIT, &req->wb_flags); + req->wb_commit_lseg = get_lseg(lseg); + } + } +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags)) + return PNFS_NOT_ATTEMPTED; + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how); +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + struct list_head *rv; + + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) { + struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode; + + set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); + rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); + /* matched by ref taken when PG_PNFS_COMMIT is set */ + put_lseg(req->wb_commit_lseg); + } else + rv = mds; + return rv; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) + put_lseg(req->wb_commit_lseg); +} + #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -252,6 +306,31 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino) pgio->pg_test = NULL; } +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + return mds; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b8ec170f2a0..ac40b8535d7 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -177,7 +177,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct qstr *name, +nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct 
nfs_fattr *fattr) { struct nfs_diropargs arg = { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 47a3ad63e0d..af0c6279a4a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void) } return p; } +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); void nfs_commit_free(struct nfs_write_data *p) { @@ -66,6 +67,7 @@ void nfs_commit_free(struct nfs_write_data *p) kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } +EXPORT_SYMBOL_GPL(nfs_commit_free); struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { @@ -179,8 +181,8 @@ static int wb_priority(struct writeback_control *wbc) if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; if (wbc->for_kupdate || wbc->for_background) - return FLUSH_LOWPRI; - return 0; + return FLUSH_LOWPRI | FLUSH_COND_STABLE; + return FLUSH_COND_STABLE; } /* @@ -387,11 +389,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) spin_lock(&inode->i_lock); error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); - if (!nfsi->npages) { - igrab(inode); - if (nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; - } + if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) + nfsi->change_attr++; set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); @@ -421,11 +420,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) clear_bit(PG_MAPPED, &req->wb_flags); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); nfsi->npages--; - if (!nfsi->npages) { - spin_unlock(&inode->i_lock); - iput(inode); - } else - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); nfs_release_request(req); } @@ -441,7 +436,7 @@ nfs_mark_request_dirty(struct nfs_page *req) * Add a request to the inode's commit list. 
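 * The request is also offered to the pNFS layout driver through
 * pnfs_mark_request_commit(); if the driver claims it, nfs_scan_list()
 * will later steer it to a per-data-server commit list.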
*/ static void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); @@ -453,6 +448,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); nfsi->ncommit++; spin_unlock(&inode->i_lock); + pnfs_mark_request_commit(req, lseg); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -474,14 +470,18 @@ nfs_clear_request_commit(struct nfs_page *req) static inline int nfs_write_need_commit(struct nfs_write_data *data) { - return data->verf.committed != NFS_FILE_SYNC; + if (data->verf.committed == NFS_DATA_SYNC) + return data->lseg == NULL; + else + return data->verf.committed != NFS_FILE_SYNC; } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); return 1; } if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { @@ -492,7 +492,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) } #else static inline void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { } @@ -509,7 +509,8 @@ int nfs_write_need_commit(struct nfs_write_data *data) } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { return 0; } @@ -612,9 +613,11 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, } if (nfs_clear_request_commit(req) && - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { NFS_I(inode)->ncommit--; + pnfs_clear_request_commit(req); + } /* Okay, the request matches. 
Update the region */ if (offset < req->wb_offset) { @@ -762,11 +765,12 @@ int nfs_updatepage(struct file *file, struct page *page, return status; } -static void nfs_writepage_release(struct nfs_page *req) +static void nfs_writepage_release(struct nfs_page *req, + struct nfs_write_data *data) { struct page *page = req->wb_page; - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) nfs_inode_remove_request(req); nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); @@ -863,7 +867,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; - if (how & FLUSH_STABLE) { + if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { data->args.stable = NFS_DATA_SYNC; if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; @@ -912,6 +916,12 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || + desc->pg_count > wsize)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -1002,6 +1012,10 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) if ((!lseg) && list_is_singular(&data->pages)) lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + /* Set up the argument struct */ ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: @@ -1074,7 +1088,7 @@ static void nfs_writeback_release_partial(void *calldata) out: if (atomic_dec_and_test(&req->wb_complete)) - nfs_writepage_release(req); + nfs_writepage_release(req, data); nfs_writedata_release(calldata); } @@ -1141,7 +1155,7 @@ static void nfs_writeback_release_full(void *calldata) if (nfs_write_need_commit(data)) { memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); dprintk(" marked for commit\n"); goto next; } @@ -1251,57 +1265,82 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) { + int ret; + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) return 1; - if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, nfs_wait_bit_killable, - TASK_KILLABLE)) - return 1; - return 0; + if (!may_wait) + return 0; + ret = out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + return (ret < 0) ? ret : 1; } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); smp_mb__after_clear_bit(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } +EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); - -static void nfs_commitdata_release(void *data) +void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; + put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } +EXPORT_SYMBOL_GPL(nfs_commitdata_release); -/* - * Set up the argument/result storage required for the RPC call. 
- */ -static int nfs_commit_rpcsetup(struct list_head *head, - struct nfs_write_data *data, - int how) +int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->path.dentry->d_inode; - int priority = flush_task_priority(how); struct rpc_task *task; + int priority = flush_task_priority(how); struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = first->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, - .callback_ops = &nfs_commit_ops, + .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, .priority = priority, }; + /* Set up the initial task struct. */ + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_commit); + +/* + * Set up the argument/result storage required for the RPC call. + */ +void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = first->wb_context->path.dentry->d_inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1309,7 +1348,9 @@ static int nfs_commit_rpcsetup(struct list_head *head, list_splice_init(head, &data->pages); data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = first->wb_context->cred; + data->lseg = lseg; /* reference transferred */ + data->mds_ops = &nfs_commit_ops; data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ @@ -1320,20 +1361,25 @@ static int nfs_commit_rpcsetup(struct list_head *head, data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); +} +EXPORT_SYMBOL_GPL(nfs_init_commit); - /* Set up the initial task struct. 
*/ - NFS_PROTO(inode)->commit_setup(data, &msg); - - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *req; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - if (how & FLUSH_SYNC) - rpc_wait_for_completion_task(task); - rpc_put_task(task); - return 0; + while (!list_empty(page_list)) { + req = nfs_list_entry(page_list->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req, lseg); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + nfs_clear_page_tag_locked(req); + } } +EXPORT_SYMBOL_GPL(nfs_retry_commit); /* * Commit dirty pages @@ -1342,7 +1388,6 @@ static int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { struct nfs_write_data *data; - struct nfs_page *req; data = nfs_commitdata_alloc(); @@ -1350,17 +1395,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) goto out_bad; /* Set up the argument struct */ - return nfs_commit_rpcsetup(head, data, how); + nfs_init_commit(data, head, NULL); + return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_mark_request_commit(req); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - nfs_clear_page_tag_locked(req); - } + nfs_retry_commit(head, NULL); nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1380,10 +1418,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) return; } -static void nfs_commit_release(void *calldata) +void nfs_commit_release_pages(struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; - struct nfs_page *req; + struct nfs_page *req; int status = data->task.tk_status; while (!list_empty(&data->pages)) { @@ -1417,6 +1454,14 @@ static void nfs_commit_release(void *calldata) next: nfs_clear_page_tag_locked(req); } +} +EXPORT_SYMBOL_GPL(nfs_commit_release_pages); + +static void nfs_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_commit_release_pages(data); nfs_commit_clear_lock(NFS_I(data->inode)); nfs_commitdata_release(calldata); } @@ -1433,23 +1478,30 @@ int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int may_wait = how & FLUSH_SYNC; - int res = 0; + int res; - if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + res = nfs_commit_set_lock(NFS_I(inode), may_wait); + if (res <= 0) goto out_mark_dirty; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); if (res) { - int error = nfs_commit_list(inode, &head, how); + int error; + + error = pnfs_commit_list(inode, &head, how); + if (error == PNFS_NOT_ATTEMPTED) + error = nfs_commit_list(inode, &head, how); if (error < 0) return error; - if (may_wait) - wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - else + if (!may_wait) goto out_mark_dirty; + error = wait_on_bit(&NFS_I(inode)->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (error < 0) + return error; } else nfs_commit_clear_lock(NFS_I(inode)); return res; @@ -1503,7 +1555,22 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int nfs_write_inode(struct inode *inode, struct 
writeback_control *wbc) { - return nfs_commit_unstable_pages(inode, wbc); + int ret; + + ret = nfs_commit_unstable_pages(inode, wbc); + if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { + int status; + bool sync = true; + + if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || + wbc->for_background) + sync = false; + + status = pnfs_layoutcommit_inode(inode, sync); + if (status < 0) + return status; + } + return ret; } /* diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 84c27d69d42..ec0f277be7f 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -117,7 +117,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, * invoked in contexts where a memory allocation failure is * fatal. Fortunately this fake ACL is small enough to * construct on the stack. */ - memset(acl2, 0, sizeof(acl2)); posix_acl_init(acl2, 4); /* Insert entries in canonical order: other orders seem diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 4c29fcf557d..07ea8d3e6ea 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -22,13 +22,14 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> -#include <linux/writeback.h> /* for inode_lock */ #include <asm/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#include "../internal.h" + /* * Recalculate the mask of events relevant to a given inode locked. */ @@ -237,15 +238,14 @@ out: * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @list: list of inodes being unmounted (sb->s_inodes) * - * Called with inode_lock held, protecting the unmounting super block's list - * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. - * We temporarily drop inode_lock, however, and CAN block. + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. */ void fsnotify_unmount_inodes(struct list_head *list) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next_i, list, i_sb_list) { struct inode *need_iput_tmp; @@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list) * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); continue; + } /* * If i_count is zero, the inode cannot have any watches and @@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list) * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ - if (!atomic_read(&inode->i_count)) + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); continue; + } need_iput_tmp = need_iput; need_iput = NULL; @@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list) __iget(inode); else need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. 
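The nfs_write_inode() hunk above decides whether the pNFS layoutcommit may block by inspecting the writeback_control: only WB_SYNC_ALL writeback that is neither nonblocking nor background work commits synchronously. The predicate in isolation, with a trimmed stand-in for struct writeback_control:

    #include <stdbool.h>
    #include <stdio.h>

    enum wb_sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

    /* Trimmed stand-in for the kernel's struct writeback_control. */
    struct wbc_lite {
        enum wb_sync_mode sync_mode;
        bool nonblocking;
        bool for_background;
    };

    /* Wait for the layoutcommit only when the caller asked for data
     * integrity and can afford to block. */
    static bool layoutcommit_sync(const struct wbc_lite *wbc)
    {
        return !(wbc->sync_mode == WB_SYNC_NONE ||
                 wbc->nonblocking || wbc->for_background);
    }

    int main(void)
    {
        struct wbc_lite bg   = { WB_SYNC_NONE, false, true };
        struct wbc_lite sync = { WB_SYNC_ALL, false, false };

        printf("background: %d, fsync-style: %d\n",
               layoutcommit_sync(&bg), layoutcommit_sync(&sync));
        return 0;
    }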
 */
 		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count) &&
-		    !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
+		    atomic_read(&next_i->i_count)) {
+			spin_lock(&next_i->i_lock);
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+				__iget(next_i);
+				need_iput = next_i;
+			}
+			spin_unlock(&next_i->i_lock);
 		}
 
 		/*
-		 * We can safely drop inode_lock here because we hold
+		 * We can safely drop inode_sb_list_lock here because we hold
 		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.  Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
+		 * will be added since the umount has begun.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_sb_list_lock);
 
 		if (need_iput_tmp)
 			iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		iput(inode);
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 }
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514b..50c00856f73 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/srcu.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d..e86577d6c5c 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
 #include <linux/mount.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a..0b56c6b7ec0 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
 *
 * Return 1 if the attributes match and 0 if not.
 *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
 * allowed to sleep.
 */
int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
 *
 * Return 0 on success and -errno on error.
 *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
 * allowed to sleep.  (Hence the GFP_ATOMIC allocation.)
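fsnotify_unmount_inodes() above can drop inode_sb_list_lock mid-walk because it pins both the current inode and the next one with __iget() before unlocking, rechecking i_state under the per-inode i_lock. The same walk-while-unlocking pattern in miniature, using a plain pthread mutex and a hand-rolled refcount (struct node, get() and put() are invented for the sketch; each node starts with one reference owned by the list itself):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        int refs;               /* last put() frees the node */
        int val;
        struct node *next;
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static void get(struct node *n) { n->refs++; }
    static void put(struct node *n) { if (--n->refs == 0) free(n); }

    /* Visit every node, dropping list_lock around the blocking work.
     * Pinning both the current and the next node keeps the walk
     * position valid while the lock is not held. */
    static void walk(struct node *head)
    {
        pthread_mutex_lock(&list_lock);
        struct node *n = head;
        while (n) {
            struct node *next = n->next;

            get(n);
            if (next)
                get(next);
            pthread_mutex_unlock(&list_lock);

            printf("visiting %d\n", n->val);    /* may block */

            pthread_mutex_lock(&list_lock);
            put(n);
            n = next;
            if (n)
                put(n);     /* safe: the list still owns a reference */
        }
        pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
        struct node *c = malloc(sizeof *c);
        struct node *b = malloc(sizeof *b);
        struct node *a = malloc(sizeof *a);

        *c = (struct node){ .refs = 1, .val = 3, .next = NULL };
        *b = (struct node){ .refs = 1, .val = 2, .next = c };
        *a = (struct node){ .refs = 1, .val = 1, .next = b };

        walk(a);

        put(a); put(b); put(c);   /* drop the list's own references */
        return 0;
    }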
*/ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 90f2729b7a5..e913ad130fd 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -24,7 +24,6 @@ #include <linux/slab.h> #include <linux/string.h> -#define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> #include "ocfs2.h" diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e4984e259cb..b27a0d86f8c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -30,7 +30,6 @@ #include <linux/swap.h> #include <linux/quotaops.h> -#define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> #include "ocfs2.h" @@ -50,6 +49,7 @@ #include "uptodate.h" #include "xattr.h" #include "refcounttree.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -886,8 +886,7 @@ static int ocfs2_validate_extent_block(struct super_block *sb, struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data; - mlog(0, "Validating extent block %llu\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); @@ -965,8 +964,6 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, struct buffer_head *eb_bh = NULL; u64 last_eb_blk = 0; - mlog_entry_void(); - el = et->et_root_el; last_eb_blk = ocfs2_et_get_last_eb_blk(et); @@ -987,7 +984,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, bail: brelse(eb_bh); - mlog_exit(retval); + trace_ocfs2_num_free_extents(retval); return retval; } @@ -1010,8 +1007,6 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); struct ocfs2_extent_block *eb; - mlog_entry_void(); - count = 0; while (count < wanted) { status = ocfs2_claim_metadata(handle, @@ -1074,8 +1069,8 @@ bail: brelse(bhs[i]); bhs[i] = NULL; } + mlog_errno(status); } - mlog_exit(status); return status; } @@ -1173,8 +1168,6 @@ static int ocfs2_add_branch(handle_t *handle, struct ocfs2_extent_list *el; u32 new_cpos, root_end; - mlog_entry_void(); - BUG_ON(!last_eb_bh || !*last_eb_bh); if (eb_bh) { @@ -1200,8 +1193,11 @@ static int ocfs2_add_branch(handle_t *handle, * from new_cpos). 
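The alloc.c hunks above and below replace free-form mlog(0, ...) strings with typed tracepoints such as trace_ocfs2_num_free_extents(); a disabled tracepoint costs roughly one predicted branch and does no formatting at the call site. A loose user-space analogy of that call-site shape follows; the real kernel events are declared with TRACE_EVENT() in ocfs2_trace.h, which this does not attempt to reproduce:

    #include <stdbool.h>
    #include <stdio.h>

    static bool trace_enabled;   /* in the kernel: a jump-label key */

    /* Each event gets a typed hook; formatting happens only when the
     * event is enabled, never unconditionally at the call site. */
    static inline void trace_num_free_extents(int retval)
    {
        if (trace_enabled)
            fprintf(stderr, "num_free_extents: retval=%d\n", retval);
    }

    int main(void)
    {
        trace_num_free_extents(3);   /* disabled: just a branch */
        trace_enabled = true;
        trace_num_free_extents(3);   /* enabled: emits the event */
        return 0;
    }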
*/ if (root_end > new_cpos) { - mlog(0, "adjust the cluster end from %u to %u\n", - root_end, new_cpos); + trace_ocfs2_adjust_rightmost_branch( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + root_end, new_cpos); + status = ocfs2_adjust_rightmost_branch(handle, et); if (status) { mlog_errno(status); @@ -1332,7 +1328,6 @@ bail: kfree(new_eb_bhs); } - mlog_exit(status); return status; } @@ -1353,8 +1348,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; - mlog_entry_void(); - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, &new_eb_bh); if (status < 0) { @@ -1415,7 +1408,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle, bail: brelse(new_eb_bh); - mlog_exit(status); return status; } @@ -1446,8 +1438,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, struct buffer_head *bh = NULL; struct buffer_head *lowest_bh = NULL; - mlog_entry_void(); - *target_bh = NULL; el = et->et_root_el; @@ -1503,7 +1493,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, bail: brelse(bh); - mlog_exit(status); return status; } @@ -1540,7 +1529,10 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, * another tree level */ if (shift) { BUG_ON(bh); - mlog(0, "need to shift tree depth (current = %d)\n", depth); + trace_ocfs2_grow_tree( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + depth); /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to @@ -1570,7 +1562,6 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ - mlog(0, "add branch. bh = %p\n", bh); ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, meta_ac); if (ret < 0) { @@ -1645,8 +1636,9 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, } insert_index = i; - mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", - insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); + trace_ocfs2_rotate_leaf(insert_cpos, insert_index, + has_empty, next_free, + le16_to_cpu(el->l_count)); BUG_ON(insert_index < 0); BUG_ON(insert_index >= le16_to_cpu(el->l_count)); @@ -2059,7 +2051,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, left_el = path_leaf_el(left_path); right_el = path_leaf_el(right_path); for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { - mlog(0, "Adjust records at index %u\n", i); + trace_ocfs2_complete_edge_insert(i); /* * One nice property of knowing that all of these @@ -2389,7 +2381,9 @@ static int ocfs2_rotate_tree_right(handle_t *handle, goto out; } - mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); + trace_ocfs2_rotate_tree_right( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); /* * What we want to do here is: @@ -2418,8 +2412,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle, * rotating subtrees. */ while (cpos && insert_cpos <= cpos) { - mlog(0, "Rotating a tree: ins. 
cpos: %u, left path cpos: %u\n", - insert_cpos, cpos); + trace_ocfs2_rotate_tree_right( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { @@ -2461,10 +2457,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle, start = ocfs2_find_subtree_root(et, left_path, right_path); - mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", - start, - (unsigned long long) right_path->p_node[start].bh->b_blocknr, - right_path->p_tree_depth); + trace_ocfs2_rotate_subtree(start, + (unsigned long long) + right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); ret = ocfs2_extend_rotate_transaction(handle, start, orig_credits, right_path); @@ -2964,8 +2960,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, subtree_root = ocfs2_find_subtree_root(et, left_path, right_path); - mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", - subtree_root, + trace_ocfs2_rotate_subtree(subtree_root, (unsigned long long) right_path->p_node[subtree_root].bh->b_blocknr, right_path->p_tree_depth); @@ -3989,9 +3984,11 @@ static int ocfs2_append_rec_to_path(handle_t *handle, goto out; } - mlog(0, "Append may need a left path update. cpos: %u, " - "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), - left_cpos); + trace_ocfs2_append_rec_to_path( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(insert_rec->e_cpos), + left_cpos); /* * No need to worry if the append is already in the @@ -4562,7 +4559,7 @@ static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, ocfs2_et_get_last_eb_blk(et), &bh); if (ret) { - mlog_exit(ret); + mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) bh->b_data; @@ -4678,9 +4675,9 @@ int ocfs2_insert_extent(handle_t *handle, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - mlog(0, "add %u clusters at position %u to owner %llu\n", - new_clusters, cpos, - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + trace_ocfs2_insert_extent_start( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, new_clusters); memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); @@ -4700,11 +4697,9 @@ int ocfs2_insert_extent(handle_t *handle, goto bail; } - mlog(0, "Insert.appending: %u, Insert.Contig: %u, " - "Insert.contig_index: %d, Insert.free_records: %d, " - "Insert.tree_depth: %d\n", - insert.ins_appending, insert.ins_contig, insert.ins_contig_index, - free_records, insert.ins_tree_depth); + trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig, + insert.ins_contig_index, free_records, + insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { status = ocfs2_grow_tree(handle, et, @@ -4726,7 +4721,6 @@ int ocfs2_insert_extent(handle_t *handle, bail: brelse(last_eb_bh); - mlog_exit(status); return status; } @@ -4746,7 +4740,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) { - int status = 0; + int status = 0, err = 0; int free_extents; enum ocfs2_alloc_restarted reason = RESTART_NONE; u32 bit_off, num_bits; @@ -4773,14 +4767,14 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, * 2) we are so fragmented, we've needed to add metadata too * many times. 
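The comment above enumerates the two reservation shortfalls; in the hunks just below, ocfs2_add_clusters_in_btree() distinguishes them through the reason codes (RESTART_META, RESTART_TRANS) and returns -EAGAIN so the caller can top up reservations and retry. A schematic of that caller-side loop with invented stand-ins (add_clusters() caps each pass at two clusters purely for demonstration):

    #include <stdio.h>

    #ifndef EAGAIN
    #define EAGAIN 11
    #endif

    enum restarted { RESTART_NONE, RESTART_META, RESTART_TRANS };

    /* Pretend allocator: adds at most 2 clusters per transaction. */
    static int add_clusters(unsigned *done, unsigned want,
                            enum restarted *why)
    {
        unsigned batch = (want - *done > 2) ? 2 : want - *done;

        *done += batch;
        if (*done < want) {
            *why = RESTART_TRANS;   /* ran out of journal credits */
            return -EAGAIN;
        }
        *why = RESTART_NONE;
        return 0;
    }

    int main(void)
    {
        unsigned done = 0, want = 5;
        enum restarted why;
        int ret;

        while ((ret = add_clusters(&done, want, &why)) == -EAGAIN) {
            /* In ocfs2: restart the transaction or reserve more
             * metadata depending on 'why', then go again. */
            printf("restart (%s), %u/%u clusters\n",
                   why == RESTART_TRANS ? "trans" : "meta", done, want);
        }
        printf("final ret=%d, %u clusters\n", ret, done);
        return 0;
    }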
*/ if (!free_extents && !meta_ac) { - mlog(0, "we haven't reserved any metadata!\n"); + err = -1; status = -EAGAIN; reason = RESTART_META; goto leave; } else if ((!free_extents) && (ocfs2_alloc_context_bits_left(meta_ac) < ocfs2_extend_meta_needed(et->et_root_el))) { - mlog(0, "filesystem is really fragmented...\n"); + err = -2; status = -EAGAIN; reason = RESTART_META; goto leave; @@ -4805,9 +4799,9 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for owner %llu\n", - num_bits, bit_off, - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + trace_ocfs2_add_clusters_in_btree( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + bit_off, num_bits); status = ocfs2_insert_extent(handle, et, *logical_offset, block, num_bits, flags, meta_ac); if (status < 0) { @@ -4821,16 +4815,15 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, *logical_offset += num_bits; if (clusters_to_add) { - mlog(0, "need to alloc once more, wanted = %u\n", - clusters_to_add); + err = clusters_to_add; status = -EAGAIN; reason = RESTART_TRANS; } leave: - mlog_exit(status); if (reason_ret) *reason_ret = reason; + trace_ocfs2_add_clusters_in_btree_ret(status, reason, err); return status; } @@ -5039,7 +5032,7 @@ int ocfs2_split_extent(handle_t *handle, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); if (ret) { - mlog_exit(ret); + mlog_errno(ret); goto out; } @@ -5056,9 +5049,9 @@ int ocfs2_split_extent(handle_t *handle, ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); - mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", - split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, - ctxt.c_split_covers_rec); + trace_ocfs2_split_extent(split_index, ctxt.c_contig_type, + ctxt.c_has_empty_extent, + ctxt.c_split_covers_rec); if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) @@ -5192,8 +5185,9 @@ int ocfs2_mark_extent_written(struct inode *inode, { int ret; - mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n", - inode->i_ino, cpos, len, phys); + trace_ocfs2_mark_extent_written( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " @@ -5512,11 +5506,10 @@ int ocfs2_remove_extent(handle_t *handle, BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); - mlog(0, "Owner %llu, remove (cpos %u, len %u). 
Existing index %d " - "(cpos %u, len %u)\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos, len, index, - le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); + trace_ocfs2_remove_extent( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len, index, le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, @@ -5795,9 +5788,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - mlog_entry("start_blk = %llu, num_clusters = %u\n", - (unsigned long long)start_blk, num_clusters); - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5834,10 +5824,9 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, goto bail; } - mlog(0, "Log truncate of %u clusters starting at cluster %u to " - "%llu (index = %d)\n", num_clusters, start_cluster, - (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); - + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index, + start_cluster, num_clusters); if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { /* * Move index back to the record we are coalescing with. @@ -5846,9 +5835,10 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, index--; num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); - mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", - index, le32_to_cpu(tl->tl_recs[index].t_start), - num_clusters); + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); } else { tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); tl->tl_used = cpu_to_le16(index + 1); @@ -5859,7 +5849,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, osb->truncated_clusters += num_clusters; bail: - mlog_exit(status); return status; } @@ -5878,8 +5867,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; - mlog_entry_void(); - di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; @@ -5915,8 +5902,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, /* if start_blk is not set, we ignore the record as * invalid. 
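ocfs2_truncate_log_append() above folds a new record into the previous one when the freed ranges are contiguous, so the fixed-size log fills more slowly. A minimal model of that merge; struct tl_rec and can_coalesce() are simplified stand-ins, with the contiguity test following the usual "new start equals previous end" rule:

    #include <stdio.h>

    struct tl_rec { unsigned start; unsigned clusters; };

    /* Merge when the new range starts exactly where the tail record
     * ends. */
    static int can_coalesce(const struct tl_rec *tail, unsigned new_start)
    {
        return tail->start + tail->clusters == new_start;
    }

    int main(void)
    {
        struct tl_rec log[16] = { { 100, 8 } };
        unsigned used = 1, start = 108, clusters = 4;

        if (used && can_coalesce(&log[used - 1], start))
            log[used - 1].clusters += clusters;   /* extend in place */
        else
            log[used++] = (struct tl_rec){ start, clusters };

        printf("records=%u, tail=(%u,+%u)\n",
               used, log[0].start, log[0].clusters);
        return 0;
    }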
*/ if (start_blk) { - mlog(0, "free record %d, start = %u, clusters = %u\n", - i, le32_to_cpu(rec.t_start), num_clusters); + trace_ocfs2_replay_truncate_records( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + i, le32_to_cpu(rec.t_start), num_clusters); status = ocfs2_free_clusters(handle, data_alloc_inode, data_alloc_bh, start_blk, @@ -5932,7 +5920,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, osb->truncated_clusters = 0; bail: - mlog_exit(status); return status; } @@ -5949,8 +5936,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - mlog_entry_void(); - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -5962,8 +5947,9 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) tl = &di->id2.i_dealloc; num_to_flush = le16_to_cpu(tl->tl_used); - mlog(0, "Flush %u records from truncate log #%llu\n", - num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); + trace_ocfs2_flush_truncate_log( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + num_to_flush); if (!num_to_flush) { status = 0; goto out; @@ -6009,7 +5995,6 @@ out_mutex: iput(data_alloc_inode); out: - mlog_exit(status); return status; } @@ -6032,15 +6017,11 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) container_of(work, struct ocfs2_super, osb_truncate_log_wq.work); - mlog_entry_void(); - status = ocfs2_flush_truncate_log(osb); if (status < 0) mlog_errno(status); else ocfs2_init_steal_slots(osb); - - mlog_exit(status); } #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) @@ -6086,7 +6067,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, *tl_inode = inode; *tl_bh = bh; bail: - mlog_exit(status); return status; } @@ -6106,7 +6086,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, *tl_copy = NULL; - mlog(0, "recover truncate log from slot %d\n", slot_num); + trace_ocfs2_begin_truncate_log_recovery(slot_num); status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); if (status < 0) { @@ -6123,8 +6103,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, tl = &di->id2.i_dealloc; if (le16_to_cpu(tl->tl_used)) { - mlog(0, "We'll have %u logs to recover\n", - le16_to_cpu(tl->tl_used)); + trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); if (!(*tl_copy)) { @@ -6157,9 +6136,9 @@ bail: if (status < 0 && (*tl_copy)) { kfree(*tl_copy); *tl_copy = NULL; + mlog_errno(status); } - mlog_exit(status); return status; } @@ -6174,8 +6153,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_truncate_log *tl; - mlog_entry_void(); - if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); return -EINVAL; @@ -6183,8 +6160,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, tl = &tl_copy->id2.i_dealloc; num_recs = le16_to_cpu(tl->tl_used); - mlog(0, "cleanup %u records from %llu\n", num_recs, - (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); + trace_ocfs2_complete_truncate_log_recovery( + (unsigned long long)le64_to_cpu(tl_copy->i_blkno), + num_recs); mutex_lock(&tl_inode->i_mutex); for(i = 0; i < num_recs; i++) { @@ -6219,7 +6197,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, bail_up: mutex_unlock(&tl_inode->i_mutex); - mlog_exit(status); return status; } @@ -6228,8 +6205,6 @@ void 
ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mlog_entry_void(); - if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); flush_workqueue(ocfs2_wq); @@ -6241,8 +6216,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) brelse(osb->osb_tl_bh); iput(osb->osb_tl_inode); } - - mlog_exit_void(); } int ocfs2_truncate_log_init(struct ocfs2_super *osb) @@ -6251,8 +6224,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) struct inode *tl_inode = NULL; struct buffer_head *tl_bh = NULL; - mlog_entry_void(); - status = ocfs2_get_truncate_log_info(osb, osb->slot_num, &tl_inode, @@ -6268,7 +6239,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) osb->osb_tl_bh = tl_bh; osb->osb_tl_inode = tl_inode; - mlog_exit(status); return status; } @@ -6350,8 +6320,8 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, else bg_blkno = ocfs2_which_suballoc_group(head->free_blk, head->free_bit); - mlog(0, "Free bit: (bit %u, blkno %llu)\n", - head->free_bit, (unsigned long long)head->free_blk); + trace_ocfs2_free_cached_blocks( + (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); @@ -6404,8 +6374,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, return ret; } - mlog(0, "Insert clusters: (bit %u, blk %llu)\n", - bit, (unsigned long long)blkno); + trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit); item->free_blk = blkno; item->free_bit = bit; @@ -6480,8 +6449,8 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, fl = ctxt->c_first_suballocator; if (fl->f_first) { - mlog(0, "Free items: (type %u, slot %d)\n", - fl->f_inode_type, fl->f_slot); + trace_ocfs2_run_deallocs(fl->f_inode_type, + fl->f_slot); ret2 = ocfs2_free_cached_blocks(osb, fl->f_inode_type, fl->f_slot, @@ -6558,8 +6527,9 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, goto out; } - mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", - type, slot, bit, (unsigned long long)blkno); + trace_ocfs2_cache_block_dealloc(type, slot, + (unsigned long long)suballoc, + (unsigned long long)blkno, bit); item->free_bg = suballoc; item->free_blk = blkno; @@ -7005,8 +6975,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct ocfs2_extent_tree et; struct ocfs2_cached_dealloc_ctxt dealloc; - mlog_entry_void(); - ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); @@ -7041,8 +7009,11 @@ start: goto bail; } - mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", - OCFS2_I(inode)->ip_clusters, path->p_tree_depth); + trace_ocfs2_commit_truncate( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + new_highest_cpos, + OCFS2_I(inode)->ip_clusters, + path->p_tree_depth); /* * By now, el will point to the extent list on the bottom most @@ -7136,7 +7107,6 @@ bail: ocfs2_free_path(path); - mlog_exit(status); return status; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index daea0359e97..ac97bca282d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -29,7 +29,6 @@ #include <linux/mpage.h> #include <linux/quotaops.h> -#define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> #include "ocfs2.h" @@ -45,6 +44,7 @@ #include "super.h" #include "symlink.h" #include "refcounttree.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -59,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct ocfs2_super *osb = 
OCFS2_SB(inode->i_sb); void *kaddr; - mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, - (unsigned long long)iblock, bh_result, create); + trace_ocfs2_symlink_get_block( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); BUG_ON(ocfs2_inode_is_fast_symlink(inode)); @@ -123,7 +124,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, bail: brelse(bh); - mlog_exit(err); return err; } @@ -136,8 +136,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, u64 p_blkno, count, past_eof; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, - (unsigned long long)iblock, bh_result, create); + trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", @@ -199,8 +199,9 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); + + trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)past_eof); if (create && (iblock >= past_eof)) set_buffer_new(bh_result); @@ -208,7 +209,6 @@ bail: if (err < 0) err = -EIO; - mlog_exit(err); return err; } @@ -278,7 +278,8 @@ static int ocfs2_readpage(struct file *file, struct page *page) loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; int ret, unlock = 1; - mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, + (page ? page->index : 0)); ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); if (ret != 0) { @@ -323,7 +324,6 @@ out_inode_unlock: out: if (unlock) unlock_page(page); - mlog_exit(ret); return ret; } @@ -396,15 +396,11 @@ out_unlock: */ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) { - int ret; - - mlog_entry("(0x%p)\n", page); - - ret = block_write_full_page(page, ocfs2_get_block, wbc); + trace_ocfs2_writepage( + (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, + page->index); - mlog_exit(ret); - - return ret; + return block_write_full_page(page, ocfs2_get_block, wbc); } /* Taken from ext3. We don't necessarily need the full blown @@ -450,7 +446,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) int err = 0; struct inode *inode = mapping->host; - mlog_entry("(block = %llu)\n", (unsigned long long)block); + trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)block); /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. @@ -484,8 +481,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) bail: status = err ? 
0 : p_blkno; - mlog_exit((int)status); - return status; } @@ -616,9 +611,6 @@ static ssize_t ocfs2_direct_IO(int rw, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; - int ret; - - mlog_entry_void(); /* * Fallback to buffered I/O if we see an inode without @@ -631,13 +623,10 @@ static ssize_t ocfs2_direct_IO(int rw, if (i_size_read(inode) <= offset) return 0; - ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, - ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); - - mlog_exit(ret); - return ret; + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, @@ -1026,6 +1015,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, &cluster_start, &cluster_end); + /* treat the write as new if the a hole/lseek spanned across + * the page boundary. + */ + new = new | ((i_size_read(inode) <= page_offset(page)) && + (page_offset(page) <= user_pos)); + if (page == wc->w_target_page) { map_from = user_pos & (PAGE_CACHE_SIZE - 1); map_to = map_from + user_len; @@ -1534,9 +1529,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = NULL; - mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", - (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, - oi->ip_dyn_features); + trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno, + len, (unsigned long long)pos, + oi->ip_dyn_features); /* * Handle inodes which already have inline data 1st. @@ -1739,6 +1734,13 @@ try_again: di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + trace_ocfs2_write_begin_nolock( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), + le32_to_cpu(di->i_clusters), + pos, len, flags, mmap_page, + clusters_to_alloc, extents_to_split); + /* * We set w_target_from, w_target_to here so that * ocfs2_write_end() knows which range in the target page to @@ -1751,12 +1753,6 @@ try_again: * ocfs2_lock_allocators(). It greatly over-estimates * the work to be done. */ - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," - " clusters_to_add = %u, extents_to_split = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), - clusters_to_alloc, extents_to_split); - ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), wc->w_di_bh); ret = ocfs2_lock_allocators(inode, &et, @@ -1938,8 +1934,8 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); kunmap_atomic(kaddr, KM_USER0); - mlog(0, "Data written to inode at offset %llu. 
" - "id_count = %u, copied = %u, i_dyn_features = 0x%x\n", + trace_ocfs2_write_end_inline( + (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)pos, *copied, le16_to_cpu(di->id2.i_data.id_count), le16_to_cpu(di->i_dyn_features)); diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index f9d5d3ffc75..5d18ad10c27 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -35,8 +35,8 @@ #include "inode.h" #include "journal.h" #include "uptodate.h" - #include "buffer_head_io.h" +#include "ocfs2_trace.h" /* * Bits on bh->b_state used by ocfs2. @@ -55,8 +55,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, { int ret = 0; - mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n", - (unsigned long long)bh->b_blocknr, ci); + trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci); BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); BUG_ON(buffer_jbd(bh)); @@ -66,6 +65,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * can get modified during recovery even if read-only. */ if (ocfs2_is_hard_readonly(osb)) { ret = -EROFS; + mlog_errno(ret); goto out; } @@ -91,11 +91,11 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * uptodate. */ ret = -EIO; put_bh(bh); + mlog_errno(ret); } ocfs2_metadata_cache_io_unlock(ci); out: - mlog_exit(ret); return ret; } @@ -106,10 +106,10 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int i; struct buffer_head *bh; - if (!nr) { - mlog(ML_BH_IO, "No buffers will be read!\n"); + trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); + + if (!nr) goto bail; - } for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { @@ -123,10 +123,8 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, bh = bhs[i]; if (buffer_jbd(bh)) { - mlog(ML_BH_IO, - "trying to sync read a jbd " - "managed bh (blocknr = %llu), skipping\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_read_blocks_sync_jbd( + (unsigned long long)bh->b_blocknr); continue; } @@ -186,8 +184,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bh; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); - mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n", - ci, (unsigned long long)block, nr, flags); + trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); BUG_ON(!ci); BUG_ON((flags & OCFS2_BH_READAHEAD) && @@ -207,7 +204,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, } if (nr == 0) { - mlog(ML_BH_IO, "No buffers will be read!\n"); status = 0; goto bail; } @@ -251,8 +247,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, */ if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { - mlog(ML_UPTODATE, - "bh (%llu), owner %llu not uptodate\n", + trace_ocfs2_read_blocks_from_disk( (unsigned long long)bh->b_blocknr, (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* We're using ignore_cache here to say @@ -260,11 +255,10 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, ignore_cache = 1; } + trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr, + ignore_cache, buffer_jbd(bh), buffer_dirty(bh)); + if (buffer_jbd(bh)) { - if (ignore_cache) - mlog(ML_BH_IO, "trying to sync read a jbd " - "managed bh (blocknr = %llu)\n", - (unsigned long long)bh->b_blocknr); continue; } @@ -272,9 +266,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, if (buffer_dirty(bh)) { /* This should probably be a 
BUG, or * at least return an error. */ - mlog(ML_BH_IO, "asking me to sync read a dirty " - "buffer! (blocknr = %llu)\n", - (unsigned long long)bh->b_blocknr); continue; } @@ -367,14 +358,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, } ocfs2_metadata_cache_io_unlock(ci); - mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", - (unsigned long long)block, nr, - ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", - flags); + trace_ocfs2_read_blocks_end((unsigned long long)block, nr, + flags, ignore_cache); bail: - mlog_exit(status); return status; } @@ -408,13 +396,12 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, int ret = 0; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; - mlog_entry_void(); - BUG_ON(buffer_jbd(bh)); ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { ret = -EROFS; + mlog_errno(ret); goto out; } @@ -434,9 +421,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, if (!buffer_uptodate(bh)) { ret = -EIO; put_bh(bh); + mlog_errno(ret); } out: - mlog_exit(ret); return ret; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 1adab287bd2..2461eb3272e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1654,8 +1654,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; - mlog_entry_void(); - ret = o2hb_read_slots(reg, reg->hr_blocks); if (ret) { mlog_errno(ret); @@ -1677,7 +1675,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) } out: - mlog_exit(ret); return ret; } diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 6c61771469a..07ac24fd925 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -30,7 +30,7 @@ struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); EXPORT_SYMBOL_GPL(mlog_and_bits); -struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK); +struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0); EXPORT_SYMBOL_GPL(mlog_not_bits); static ssize_t mlog_mask_show(u64 mask, char *buf) @@ -80,8 +80,6 @@ struct mlog_attribute { } static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { - define_mask(ENTRY), - define_mask(EXIT), define_mask(TCP), define_mask(MSG), define_mask(SOCKET), @@ -93,27 +91,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(DLM_THREAD), define_mask(DLM_MASTER), define_mask(DLM_RECOVERY), - define_mask(AIO), - define_mask(JOURNAL), - define_mask(DISK_ALLOC), - define_mask(SUPER), - define_mask(FILE_IO), - define_mask(EXTENT_MAP), define_mask(DLM_GLUE), - define_mask(BH_IO), - define_mask(UPTODATE), - define_mask(NAMEI), - define_mask(INODE), define_mask(VOTE), - define_mask(DCACHE), define_mask(CONN), define_mask(QUORUM), - define_mask(EXPORT), - define_mask(XATTR), - define_mask(QUOTA), - define_mask(REFCOUNT), define_mask(BASTS), - define_mask(RESERVATIONS), define_mask(CLUSTER), define_mask(ERROR), define_mask(NOTICE), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 34d6544357d..baa2b9ef7ee 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -82,41 +82,23 @@ /* bits that are frequently given and infrequently matched in the low word */ /* NOTE: If you add a flag, you need to also update masklog.c! 
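masklog filters every message against two global bitmaps: a message goes out when its mask intersects the allowed set (mlog_and_bits) and misses the suppressed set (mlog_not_bits), which is why deleting ML_ENTRY/ML_EXIT in the hunks below lets all remaining bits shift down. A toy version of the filter with illustrative bit values, not the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    #define ML_TCP    0x0001ULL
    #define ML_MSG    0x0002ULL
    #define ML_ERROR  0x1000ULL
    #define ML_NOTICE 0x2000ULL

    static uint64_t and_bits = ML_ERROR | ML_NOTICE;  /* allowed set */
    static uint64_t not_bits;                         /* suppressed set */

    /* Emit only when some allowed bit is set and no suppressed bit is. */
    #define mlog(mask, fmt, ...)                                  \
        do {                                                      \
            if (((mask) & and_bits) && !((mask) & not_bits))      \
                printf(fmt, ##__VA_ARGS__);                       \
        } while (0)

    int main(void)
    {
        mlog(ML_TCP, "tcp: dropped\n");        /* filtered out */
        mlog(ML_ERROR, "error: printed\n");    /* passes */
        not_bits |= ML_ERROR;
        mlog(ML_ERROR, "error: now muted\n");  /* suppressed */
        return 0;
    }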
*/ -#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ -#define ML_EXIT 0x0000000000000002ULL /* func call exit */ -#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ -#define ML_MSG 0x0000000000000008ULL /* net network messages */ -#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */ -#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */ -#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */ -#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */ -#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */ -#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */ -#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */ -#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */ -#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm master functions */ -#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */ -#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */ -#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */ -#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */ -#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */ -#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */ -#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */ -#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */ -#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */ -#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */ -#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */ -#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */ -#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */ -#define ML_CONN 0x0000000004000000ULL /* net connection management */ -#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ -#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ -#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ -#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ -#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ -#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */ -#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */ -#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */ +#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */ +#define ML_MSG 0x0000000000000002ULL /* net network messages */ +#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */ +#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */ +#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */ +#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */ +#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */ +#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */ +#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */ +#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */ +#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */ +#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */ +#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */ +#define ML_CONN 0x0000000000002000ULL /* net connection management */ +#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */ +#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */ +#define ML_CLUSTER 0x0000000000010000ULL /* 
cluster stack */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ @@ -124,7 +106,6 @@ #define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) -#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) #ifndef MLOG_MASK_PREFIX #define MLOG_MASK_PREFIX 0 #endif @@ -222,58 +203,6 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ } while (0) -#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) -#define mlog_entry(fmt, args...) do { \ - mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \ -} while (0) - -#define mlog_entry_void() do { \ - mlog(ML_ENTRY, "ENTRY:\n"); \ -} while (0) - -/* - * We disable this for sparse. - */ -#if !defined(__CHECKER__) -#define mlog_exit(st) do { \ - if (__builtin_types_compatible_p(typeof(st), unsigned long)) \ - mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \ - else if (__builtin_types_compatible_p(typeof(st), signed long)) \ - mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \ - else if (__builtin_types_compatible_p(typeof(st), unsigned int) \ - || __builtin_types_compatible_p(typeof(st), unsigned short) \ - || __builtin_types_compatible_p(typeof(st), unsigned char)) \ - mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \ - else if (__builtin_types_compatible_p(typeof(st), signed int) \ - || __builtin_types_compatible_p(typeof(st), signed short) \ - || __builtin_types_compatible_p(typeof(st), signed char)) \ - mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \ - else if (__builtin_types_compatible_p(typeof(st), long long)) \ - mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ - else \ - mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \ -} while (0) -#else -#define mlog_exit(st) do { \ - mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ -} while (0) -#endif - -#define mlog_exit_ptr(ptr) do { \ - mlog(ML_EXIT, "EXIT: %p\n", ptr); \ -} while (0) - -#define mlog_exit_void() do { \ - mlog(ML_EXIT, "EXIT\n"); \ -} while (0) -#else -#define mlog_entry(...) do { } while (0) -#define mlog_entry_void(...) do { } while (0) -#define mlog_exit(...) do { } while (0) -#define mlog_exit_ptr(...) do { } while (0) -#define mlog_exit_void(...) do { } while (0) -#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */ - #define mlog_bug_on_msg(cond, fmt, args...) 
do { \ if (cond) { \ mlog(ML_ERROR, "bug expression: " #cond "\n"); \ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 3b11cb1e38f..ee04ff5ee60 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -210,10 +210,6 @@ static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc) sc->sc_tv_func_stop = ktime_get(); } -static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) -{ - return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); -} #else /* CONFIG_DEBUG_FS */ # define o2net_init_nst(a, b, c, d, e) # define o2net_set_nst_sock_time(a) @@ -227,10 +223,14 @@ static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) # define o2net_set_advance_stop_time(a) # define o2net_set_func_start_time(a) # define o2net_set_func_stop_time(a) -# define o2net_get_func_run_time(a) (ktime_t)0 #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_OCFS2_FS_STATS +static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc) +{ + return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); +} + static void o2net_update_send_stats(struct o2net_send_tracking *nst, struct o2net_sock_container *sc) { diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 7eb90403fc8..e5ba3481833 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -28,7 +28,6 @@ #include <linux/slab.h> #include <linux/namei.h> -#define MLOG_MASK_PREFIX ML_DCACHE #include <cluster/masklog.h> #include "ocfs2.h" @@ -39,6 +38,7 @@ #include "file.h" #include "inode.h" #include "super.h" +#include "ocfs2_trace.h" void ocfs2_dentry_attach_gen(struct dentry *dentry) { @@ -62,8 +62,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, inode = dentry->d_inode; osb = OCFS2_SB(dentry->d_sb); - mlog_entry("(0x%p, '%.*s')\n", dentry, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, + dentry->d_name.name); /* For a negative dentry - * check the generation number of the parent and compare with the @@ -73,9 +73,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned long gen = (unsigned long) dentry->d_fsdata; unsigned long pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; - mlog(0, "negative dentry: %.*s parent gen: %lu " - "dentry gen: %lu\n", - dentry->d_name.len, dentry->d_name.name, pgen, gen); + + trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, + dentry->d_name.name, + pgen, gen); if (gen != pgen) goto bail; goto valid; @@ -90,8 +91,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, /* did we or someone else delete this inode? */ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&OCFS2_I(inode)->ip_lock); - mlog(0, "inode (%llu) deleted, returning false\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + trace_ocfs2_dentry_revalidate_delete( + (unsigned long long)OCFS2_I(inode)->ip_blkno); goto bail; } spin_unlock(&OCFS2_I(inode)->ip_lock); @@ -101,10 +102,9 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, * inode nlink hits zero, it never goes back. */ if (inode->i_nlink == 0) { - mlog(0, "Inode %llu orphaned, returning false " - "dir = %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - S_ISDIR(inode->i_mode)); + trace_ocfs2_dentry_revalidate_orphaned( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); goto bail; } @@ -113,9 +113,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, * redo it. 
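ocfs2_dentry_revalidate() above trusts a cached negative dentry only while the generation number stashed in d_fsdata still matches the parent directory's ip_dir_lock_gen; another node creating the name bumps the parent generation and invalidates the cached miss. The idea in miniature, with names invented for the sketch:

    #include <stdio.h>

    struct dir  { unsigned long gen; };        /* bumped on dir change */
    struct dent { struct dir *parent; unsigned long gen_at_lookup; };

    /* A cached "no such file" answer is valid only while the parent
     * directory generation is unchanged. */
    static int negative_dentry_valid(const struct dent *d)
    {
        return d->gen_at_lookup == d->parent->gen;
    }

    int main(void)
    {
        struct dir dir = { .gen = 1 };
        struct dent miss = { &dir, dir.gen };

        printf("valid=%d\n", negative_dentry_valid(&miss));
        dir.gen++;   /* another node created a name in this directory */
        printf("valid=%d\n", negative_dentry_valid(&miss));
        return 0;
    }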
*/ if (!dentry->d_fsdata) { - mlog(0, "Inode %llu doesn't have dentry lock, " - "returning false\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + trace_ocfs2_dentry_revalidate_nofsdata( + (unsigned long long)OCFS2_I(inode)->ip_blkno); goto bail; } @@ -123,8 +122,7 @@ valid: ret = 1; bail: - mlog_exit(ret); - + trace_ocfs2_dentry_revalidate_ret(ret); return ret; } @@ -181,8 +179,8 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { - mlog(0, "dentry found: %.*s\n", - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_find_local_alias(dentry->d_name.len, + dentry->d_name.name); dget_dlock(dentry); spin_unlock(&dentry->d_lock); @@ -240,9 +238,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct dentry *alias; struct ocfs2_dentry_lock *dl = dentry->d_fsdata; - mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n", - dentry->d_name.len, dentry->d_name.name, - (unsigned long long)parent_blkno, dl); + trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, dl); /* * Negative dentry. We ignore these for now. @@ -292,7 +289,9 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, (unsigned long long)parent_blkno, (unsigned long long)dl->dl_parent_blkno); - mlog(0, "Found: %s\n", dl->dl_lockres.l_name); + trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name, + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); goto out_attach; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f97b6f1c61d..9fe5b8fd658 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -43,7 +43,6 @@ #include <linux/quotaops.h> #include <linux/sort.h> -#define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> #include "ocfs2.h" @@ -61,6 +60,7 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -322,21 +322,23 @@ static int ocfs2_check_dir_entry(struct inode * dir, const char *error_msg = NULL; const int rlen = le16_to_cpu(de->rec_len); - if (rlen < OCFS2_DIR_REC_LEN(1)) + if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + else if (unlikely( + ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)) error_msg = "directory entry across blocks"; - if (error_msg != NULL) + if (unlikely(error_msg != NULL)) mlog(ML_ERROR, "bad entry in directory #%llu: %s - " "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, offset, (unsigned long long)le64_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; } @@ -367,8 +369,6 @@ static inline int ocfs2_search_dirblock(struct buffer_head *bh, int de_len; int ret = 0; - mlog_entry_void(); - de_buf = first_de; dlimit = de_buf + bytes; @@ -402,7 +402,7 @@ static inline int ocfs2_search_dirblock(struct buffer_head *bh, } bail: - mlog_exit(ret); + trace_ocfs2_search_dirblock(ret); return ret; } @@ -447,8 +447,7 @@ static int ocfs2_validate_dir_block(struct super_block *sb, * We don't validate dirents here, that's handled * in-place when the code walks them. 
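The ocfs2_check_dir_entry() hunk below wraps each sanity test in unlikely() so the compiler moves the error paths out of line; the tests themselves are plain arithmetic over the on-disk record. A condensed user-space rendition; DIR_REC_LEN is re-derived here from the customary 12-byte header plus a name padded to 4 bytes, which may not match ocfs2's exact macro:

    #include <stdio.h>

    #define DIR_PAD   4
    #define DIR_ROUND (DIR_PAD - 1)
    #define DIR_REC_LEN(name_len) \
        (((name_len) + 12 + DIR_ROUND) & ~DIR_ROUND)

    struct dirent_disk { unsigned short rec_len; unsigned char name_len; };

    /* Returns NULL if the entry is sane, else a message naming the
     * violated invariant. */
    static const char *check_dir_entry(const struct dirent_disk *de,
                                       unsigned long off_in_block,
                                       unsigned long blocksize)
    {
        unsigned rlen = de->rec_len;

        if (rlen < DIR_REC_LEN(1))
            return "rec_len is smaller than minimal";
        if (rlen % 4 != 0)
            return "rec_len % 4 != 0";
        if (rlen < DIR_REC_LEN(de->name_len))
            return "rec_len is too small for name_len";
        if (off_in_block + rlen > blocksize)
            return "directory entry across blocks";
        return NULL;
    }

    int main(void)
    {
        struct dirent_disk bad = { .rec_len = 10, .name_len = 1 };
        const char *msg = check_dir_entry(&bad, 0, 4096);

        printf("%s\n", msg ? msg : "ok");
        return 0;
    }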
*/ - mlog(0, "Validating dirblock %llu\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); @@ -706,8 +705,6 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, int num = 0; int nblocks, i, err; - mlog_entry_void(); - sb = dir->i_sb; nblocks = i_size_read(dir) >> sb->s_blocksize_bits; @@ -788,7 +785,7 @@ cleanup_and_exit: for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); - mlog_exit_ptr(ret); + trace_ocfs2_find_entry_el(ret); return ret; } @@ -950,11 +947,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen, goto out; } - mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x " - "returns: %llu\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno, - namelen, name, hinfo->major_hash, hinfo->minor_hash, - (unsigned long long)phys); + trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno, + namelen, name, hinfo->major_hash, + hinfo->minor_hash, (unsigned long long)phys); ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); if (ret) { @@ -964,9 +959,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen, dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; - mlog(0, "leaf info: num_used: %d, count: %d\n", - le16_to_cpu(dx_leaf->dl_list.de_num_used), - le16_to_cpu(dx_leaf->dl_list.de_count)); + trace_ocfs2_dx_dir_search_leaf_info( + le16_to_cpu(dx_leaf->dl_list.de_num_used), + le16_to_cpu(dx_leaf->dl_list.de_count)); entry_list = &dx_leaf->dl_list; @@ -1166,8 +1161,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, int i, status = -ENOENT; ocfs2_journal_access_func access = ocfs2_journal_access_db; - mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) access = ocfs2_journal_access_di; @@ -1202,7 +1195,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); } bail: - mlog_exit(status); return status; } @@ -1348,8 +1340,8 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, } } - mlog(0, "Dir %llu: delete entry at index: %d\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno, index); + trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno, + index); ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, leaf_bh, leaf_bh->b_data, leaf_bh->b_size); @@ -1632,8 +1624,6 @@ int __ocfs2_add_entry(handle_t *handle, struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; - mlog_entry_void(); - if (!namelen) return -EINVAL; @@ -1765,8 +1755,9 @@ int __ocfs2_add_entry(handle_t *handle, * from ever getting here. 
*/ retval = -ENOSPC; bail: + if (retval) + mlog_errno(retval); - mlog_exit(retval); return retval; } @@ -2028,8 +2019,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) struct inode *inode = filp->f_path.dentry->d_inode; int lock_level = 0; - mlog_entry("dirino=%llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); if (lock_level && error >= 0) { @@ -2051,9 +2041,10 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) dirent, filldir, NULL); ocfs2_inode_unlock(inode, lock_level); + if (error) + mlog_errno(error); bail_nolock: - mlog_exit(error); return error; } @@ -2069,8 +2060,8 @@ int ocfs2_find_files_on_disk(const char *name, { int status = -ENOENT; - mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + trace_ocfs2_find_files_on_disk(namelen, name, blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); status = ocfs2_find_entry(name, namelen, inode, lookup); if (status) @@ -2114,8 +2105,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir, int ret; struct ocfs2_dir_lookup_result lookup = { NULL, }; - mlog_entry("dir %llu, name '%.*s'\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); + trace_ocfs2_check_dir_for_entry( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); ret = -EEXIST; if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) @@ -2125,7 +2116,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir, bail: ocfs2_free_dir_lookup_result(&lookup); - mlog_exit(ret); + if (ret) + mlog_errno(ret); return ret; } @@ -2324,8 +2316,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, struct buffer_head *new_bh = NULL; struct ocfs2_dir_entry *de; - mlog_entry_void(); - if (ocfs2_new_dir_wants_trailer(inode)) size = ocfs2_dir_trailer_blk_off(parent->i_sb); @@ -2380,7 +2370,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, bail: brelse(new_bh); - mlog_exit(status); return status; } @@ -2409,9 +2398,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, goto out; } - mlog(0, "Dir %llu, attach new index block: %llu\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)dr_blkno); + trace_ocfs2_dx_dir_attach_index( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)dr_blkno); dx_root_bh = sb_getblk(osb->sb, dr_blkno); if (dx_root_bh == NULL) { @@ -2511,11 +2500,10 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, dx_leaf->dl_list.de_count = cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); - mlog(0, - "Dir %llu, format dx_leaf: %llu, entry count: %u\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)bh->b_blocknr, - le16_to_cpu(dx_leaf->dl_list.de_count)); + trace_ocfs2_dx_dir_format_cluster( + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(dx_leaf->dl_list.de_count)); ocfs2_journal_dirty(handle, bh); } @@ -2759,12 +2747,11 @@ static void ocfs2_dx_dir_index_root_block(struct inode *dir, ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); - mlog(0, - "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n", - (unsigned long long)dir->i_ino, hinfo.major_hash, - hinfo.minor_hash, - le16_to_cpu(dx_root->dr_entries.de_num_used), - de->name_len, de->name); + trace_ocfs2_dx_dir_index_root_block( + (unsigned long long)dir->i_ino, + hinfo.major_hash, 
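/*
 * Pattern worth noting in the conversions above: the unconditional
 * mlog_exit(status) at function exit is replaced with mlog_errno(),
 * which reports only failures. The resulting exit-path shape, as a
 * sketch (the function name is hypothetical):
 */
#include <cluster/masklog.h>

static int sample_exit_path(int status)
{
	/* ... work that may leave a negative errno in status ... */
	if (status)
		mlog_errno(status);	/* logs file:line plus the errno */
	return status;
}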
hinfo.minor_hash, + de->name_len, de->name, + le16_to_cpu(dx_root->dr_entries.de_num_used)); ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, dirent_blk); @@ -3235,7 +3222,6 @@ static int ocfs2_do_extend_dir(struct super_block *sb, bail: if (did_quota && status < 0) dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); - mlog_exit(status); return status; } @@ -3270,8 +3256,6 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, struct ocfs2_extent_tree et; struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; - mlog_entry_void(); - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { /* * This would be a code error as an inline directory should @@ -3320,8 +3304,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, down_write(&OCFS2_I(dir)->ip_alloc_sem); drop_alloc_sem = 1; dir_i_size = i_size_read(dir); - mlog(0, "extending dir %llu (i_size = %lld)\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); + trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno, + dir_i_size); /* dir->i_size is always block aligned. */ spin_lock(&OCFS2_I(dir)->ip_lock); @@ -3436,7 +3420,6 @@ bail: brelse(new_bh); - mlog_exit(status); return status; } @@ -3583,8 +3566,9 @@ next: status = 0; bail: brelse(bh); + if (status) + mlog_errno(status); - mlog_exit(status); return status; } @@ -3815,9 +3799,9 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, struct ocfs2_dx_root_block *dx_root; struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; - mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)leaf_blkno, insert_hash); + trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)leaf_blkno, + insert_hash); ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); @@ -3897,8 +3881,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out_commit; } - mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n", - leaf_cpos, split_hash, insert_hash); + trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash); /* * We have to carefully order operations here. 
There are items @@ -4355,8 +4338,8 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, unsigned int blocks_wanted = 1; struct buffer_head *bh = NULL; - mlog(0, "getting ready to insert namelen %d into dir %llu\n", - namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); + trace_ocfs2_prepare_dir_for_insert( + (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); if (!namelen) { ret = -EINVAL; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 9f30491e5e8..29a886d1e82 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -128,8 +128,8 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); - mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n", - lock->ml.type, lock->ml.convert_type, type); + mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n", + lock->ml.type, lock->ml.convert_type, type); spin_lock(&lock->spinlock); @@ -353,7 +353,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, struct kvec vec[2]; size_t veclen = 1; - mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); memset(&convert, 0, sizeof(struct dlm_convert_lock)); convert.node_idx = dlm->node_num; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7e38a072d72..7540a492eab 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -188,7 +188,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, struct hlist_head *bucket; struct hlist_node *list; - mlog_entry("%.*s\n", len, name); + mlog(0, "%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); @@ -222,7 +222,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res = NULL; - mlog_entry("%.*s\n", len, name); + mlog(0, "%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); @@ -531,7 +531,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, unsigned int node; struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; - mlog_entry("%p %u %p", msg, len, data); + mlog(0, "%p %u %p", msg, len, data); if (!dlm_grab(dlm)) return 0; @@ -926,9 +926,10 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, } static int dlm_match_regions(struct dlm_ctxt *dlm, - struct dlm_query_region *qr) + struct dlm_query_region *qr, + char *local, int locallen) { - char *local = NULL, *remote = qr->qr_regions; + char *remote = qr->qr_regions; char *l, *r; int localnr, i, j, foundit; int status = 0; @@ -957,13 +958,8 @@ static int dlm_match_regions(struct dlm_ctxt *dlm, r += O2HB_MAX_REGION_NAME_LEN; } - local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC); - if (!local) { - status = -ENOMEM; - goto bail; - } - - localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS); + localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN); + localnr = o2hb_get_all_regions(local, (u8)localnr); /* compare local regions with remote */ l = local; @@ -1012,8 +1008,6 @@ static int dlm_match_regions(struct dlm_ctxt *dlm, } bail: - kfree(local); - return status; } @@ -1075,6 +1069,7 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, { struct dlm_query_region *qr; struct dlm_ctxt *dlm = NULL; + char *local = NULL; int status = 0; int locked = 0; @@ -1083,6 +1078,13 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, 
qr->qr_domain); + /* buffer used in dlm_match_regions() */ + local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + if (!local) { + status = -ENOMEM; + goto bail; + } + status = -EINVAL; spin_lock(&dlm_domain_lock); @@ -1112,13 +1114,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, goto bail; } - status = dlm_match_regions(dlm, qr); + status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); bail: if (locked) spin_unlock(&dlm->spinlock); spin_unlock(&dlm_domain_lock); + kfree(local); + return status; } @@ -1553,7 +1557,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) struct domain_join_ctxt *ctxt; enum dlm_query_join_response_code response = JOIN_DISALLOW; - mlog_entry("%p", dlm); + mlog(0, "%p", dlm); ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (!ctxt) { diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 7009292aac5..8d39e0fd66f 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -128,7 +128,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, int call_ast = 0, kick_thread = 0; enum dlm_status status = DLM_NORMAL; - mlog_entry("type=%d\n", lock->ml.type); + mlog(0, "type=%d\n", lock->ml.type); spin_lock(&res->spinlock); /* if called from dlm_create_lock_handler, need to @@ -227,8 +227,8 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, enum dlm_status status = DLM_DENIED; int lockres_changed = 1; - mlog_entry("type=%d\n", lock->ml.type); - mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, + mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n", + lock->ml.type, res->lockname.len, res->lockname.name, flags); spin_lock(&res->spinlock); @@ -308,8 +308,6 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, int tmpret, status = 0; enum dlm_status ret; - mlog_entry_void(); - memset(&create, 0, sizeof(create)); create.node_idx = dlm->node_num; create.requested_type = lock->ml.type; @@ -477,8 +475,6 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data, BUG_ON(!dlm); - mlog_entry_void(); - if (!dlm_grab(dlm)) return DLM_REJECTED; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 59f0f6bdfc6..9d67610dfc7 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -426,8 +426,6 @@ static void dlm_mle_release(struct kref *kref) struct dlm_master_list_entry *mle; struct dlm_ctxt *dlm; - mlog_entry_void(); - mle = container_of(kref, struct dlm_master_list_entry, mle_refs); dlm = mle->dlm; @@ -3120,8 +3118,6 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, *oldmle = NULL; - mlog_entry_void(); - assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); @@ -3261,7 +3257,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct hlist_node *list; unsigned int i; - mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); + mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); top: assert_spin_locked(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index aaaffbcbe91..f1beb6fc254 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -727,7 +727,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) if (destroy) dlm_destroy_recovery_area(dlm, dead_node); - mlog_exit(status); return status; } @@ -1496,9 +1495,9 @@ leave: kfree(buf); if (item) kfree(item); + mlog_errno(ret); } - mlog_exit(ret); return ret; } @@ -1567,7 +1566,6 @@ leave: dlm_lockres_put(res); } kfree(data); - mlog_exit(ret); } @@ -1986,7
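/*
 * Why the dlm_query_region_handler()/dlm_match_regions() change above
 * exists: kmalloc(GFP_KERNEL) may sleep, which is illegal under
 * dlm_domain_lock and dlm->spinlock, so the old GFP_ATOMIC allocation
 * inside the locked region was both failure-prone and needless. The
 * general shape of the fix, with hypothetical names:
 */
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(sample_lock);

static int sample_handler(size_t len)
{
	char *buf;

	/* allocate before locking, so GFP_KERNEL (sleeping) is fine */
	buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	spin_lock(&sample_lock);
	/* ... use buf; no allocations while the lock is held ... */
	spin_unlock(&sample_lock);

	kfree(buf);
	return 0;
}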
+1984,6 @@ leave: dlm_lock_put(newlock); } - mlog_exit(ret); return ret; } @@ -2083,8 +2080,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, struct hlist_head *bucket; struct dlm_lock_resource *res, *next; - mlog_entry_void(); - assert_spin_locked(&dlm->spinlock); list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { @@ -2607,8 +2602,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) int nodenum; int status; - mlog_entry("%u\n", dead_node); - mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); spin_lock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 817287c6a6d..850aa7e8753 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -317,7 +317,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, struct kvec vec[2]; size_t veclen = 1; - mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + mlog(0, "%.*s\n", res->lockname.len, res->lockname.name); if (owner == dlm->node_num) { /* ended up trying to contact ourself. this means @@ -588,8 +588,6 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, struct dlm_lock *lock = NULL; int call_ast, is_master; - mlog_entry_void(); - if (!lksb) { dlm_error(DLM_BADARGS); return DLM_BADARGS; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e8d94d722ec..7642d7ca73e 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -64,7 +64,7 @@ struct ocfs2_mask_waiter { unsigned long mw_mask; unsigned long mw_goal; #ifdef CONFIG_OCFS2_FS_STATS - unsigned long long mw_lock_start; + ktime_t mw_lock_start; #endif }; @@ -397,8 +397,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type, { int len; - mlog_entry_void(); - BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", @@ -408,8 +406,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type, BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); mlog(0, "built lock resource with name: %s\n", name); - - mlog_exit_void(); } static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); @@ -435,44 +431,41 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) #ifdef CONFIG_OCFS2_FS_STATS static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) { - res->l_lock_num_prmode = 0; - res->l_lock_num_prmode_failed = 0; - res->l_lock_total_prmode = 0; - res->l_lock_max_prmode = 0; - res->l_lock_num_exmode = 0; - res->l_lock_num_exmode_failed = 0; - res->l_lock_total_exmode = 0; - res->l_lock_max_exmode = 0; res->l_lock_refresh = 0; + memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats)); + memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats)); } static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, struct ocfs2_mask_waiter *mw, int ret) { - unsigned long long *num, *sum; - unsigned int *max, *failed; - struct timespec ts = current_kernel_time(); - unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start; - - if (level == LKM_PRMODE) { - num = &res->l_lock_num_prmode; - sum = &res->l_lock_total_prmode; - max = &res->l_lock_max_prmode; - failed = &res->l_lock_num_prmode_failed; - } else if (level == LKM_EXMODE) { - num = &res->l_lock_num_exmode; - sum = &res->l_lock_total_exmode; - max = &res->l_lock_max_exmode; - failed = &res->l_lock_num_exmode_failed; - } else + u32 usec; + ktime_t kt; + struct ocfs2_lock_stats *stats; + + if (level == LKM_PRMODE) + stats = &res->l_lock_prmode; + else if (level == LKM_EXMODE) + 
stats = &res->l_lock_exmode; + else return; - (*num)++; - (*sum) += time; - if (time > *max) - *max = time; + kt = ktime_sub(ktime_get(), mw->mw_lock_start); + usec = ktime_to_us(kt); + + stats->ls_gets++; + stats->ls_total += ktime_to_ns(kt); + /* overflow */ + if (unlikely(stats->ls_gets == 0)) { + stats->ls_gets++; + stats->ls_total = ktime_to_ns(kt); + } + + if (stats->ls_max < usec) + stats->ls_max = usec; + if (ret) - (*failed)++; + stats->ls_fail++; } static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) @@ -482,8 +475,7 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { - struct timespec ts = current_kernel_time(); - mw->mw_lock_start = timespec_to_ns(&ts); + mw->mw_lock_start = ktime_get(); } #else static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) @@ -729,8 +721,6 @@ void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { - mlog_entry_void(); - if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) return; @@ -756,14 +746,11 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) memset(&res->l_lksb, 0, sizeof(res->l_lksb)); res->l_flags = 0UL; - mlog_exit_void(); } static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { - mlog_entry_void(); - BUG_ON(!lockres); switch(level) { @@ -776,15 +763,11 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, default: BUG(); } - - mlog_exit_void(); } static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, int level) { - mlog_entry_void(); - BUG_ON(!lockres); switch(level) { @@ -799,7 +782,6 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, default: BUG(); } - mlog_exit_void(); } /* WARNING: This function lives in a world where the only three lock @@ -846,8 +828,6 @@ static void lockres_clear_flags(struct ocfs2_lock_res *lockres, static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) { - mlog_entry_void(); - BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); @@ -860,14 +840,10 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); } lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); - - mlog_exit_void(); } static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) { - mlog_entry_void(); - BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); @@ -889,14 +865,10 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); - - mlog_exit_void(); } static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) { - mlog_entry_void(); - BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); @@ -908,15 +880,12 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc lockres->l_level = lockres->l_requested; lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); - - mlog_exit_void(); } static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level) { int needs_downconvert = 0; - mlog_entry_void(); assert_spin_locked(&lockres->l_lock);
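/*
 * The counters updated above live in struct ocfs2_lock_stats, defined
 * in ocfs2.h and not shown in this hunk; from the accessors used in
 * this patch its shape is presumably:
 */
struct ocfs2_lock_stats {
	u64	ls_total;	/* total wait, nanoseconds */
	u32	ls_gets;	/* number of acquires */
	u32	ls_fail;	/* number of failed acquires */
	u32	ls_max;		/* max single wait, microseconds */
};

/* Mean wait derived from the counters, as a sketch: */
#include <linux/math64.h>

static u64 sample_avg_wait_ns(const struct ocfs2_lock_stats *s)
{
	return s->ls_gets ? div64_u64(s->ls_total, s->ls_gets) : 0;
}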
@@ -938,8 +907,7 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, if (needs_downconvert) lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); - - mlog_exit(needs_downconvert); + mlog(0, "needs_downconvert = %d\n", needs_downconvert); return needs_downconvert; } @@ -1151,8 +1119,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); unsigned long flags; - mlog_entry_void(); - mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n", lockres->l_name, lockres->l_unlock_action); @@ -1162,7 +1128,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) "unlock_action %d\n", error, lockres->l_name, lockres->l_unlock_action); spin_unlock_irqrestore(&lockres->l_lock, flags); - mlog_exit_void(); return; } @@ -1186,8 +1151,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error) lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; wake_up(&lockres->l_event); spin_unlock_irqrestore(&lockres->l_lock, flags); - - mlog_exit_void(); } /* @@ -1233,7 +1196,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, { unsigned long flags; - mlog_entry_void(); spin_lock_irqsave(&lockres->l_lock, flags); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); @@ -1244,7 +1206,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, spin_unlock_irqrestore(&lockres->l_lock, flags); wake_up(&lockres->l_event); - mlog_exit_void(); } /* Note: If we detect another process working on the lock (i.e., @@ -1260,8 +1221,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, unsigned long flags; unsigned int gen; - mlog_entry_void(); - mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, dlm_flags); @@ -1293,7 +1252,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); bail: - mlog_exit(ret); return ret; } @@ -1416,8 +1374,6 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned int gen; int noqueue_attempted = 0; - mlog_entry_void(); - ocfs2_init_mask_waiter(&mw); if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) @@ -1583,7 +1539,6 @@ out: caller_ip); } #endif - mlog_exit(ret); return ret; } @@ -1605,7 +1560,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, { unsigned long flags; - mlog_entry_void(); spin_lock_irqsave(&lockres->l_lock, flags); ocfs2_dec_holders(lockres, level); ocfs2_downconvert_on_unlock(osb, lockres); @@ -1614,7 +1568,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, if (lockres->l_lockdep_map.key != NULL) rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); #endif - mlog_exit_void(); } static int ocfs2_create_new_lock(struct ocfs2_super *osb, @@ -1648,8 +1601,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode) BUG_ON(!inode); BUG_ON(!ocfs2_inode_is_new(inode)); - mlog_entry_void(); - mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); /* NOTE: That we don't increment any of the holder counts, nor @@ -1683,7 +1634,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode) } bail: - mlog_exit(ret); return ret; } @@ -1695,16 +1645,12 @@ int ocfs2_rw_lock(struct inode *inode, int write) BUG_ON(!inode); - mlog_entry_void(); - mlog(0, "inode %llu take %s RW lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? 
"EXMODE" : "PRMODE"); - if (ocfs2_mount_local(osb)) { - mlog_exit(0); + if (ocfs2_mount_local(osb)) return 0; - } lockres = &OCFS2_I(inode)->ip_rw_lockres; @@ -1715,7 +1661,6 @@ int ocfs2_rw_lock(struct inode *inode, int write) if (status < 0) mlog_errno(status); - mlog_exit(status); return status; } @@ -1725,16 +1670,12 @@ void ocfs2_rw_unlock(struct inode *inode, int write) struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry_void(); - mlog(0, "inode %llu drop %s RW lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); if (!ocfs2_mount_local(osb)) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); - - mlog_exit_void(); } /* @@ -1748,8 +1689,6 @@ int ocfs2_open_lock(struct inode *inode) BUG_ON(!inode); - mlog_entry_void(); - mlog(0, "inode %llu take PRMODE open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1764,7 +1703,6 @@ int ocfs2_open_lock(struct inode *inode) mlog_errno(status); out: - mlog_exit(status); return status; } @@ -1776,8 +1714,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write) BUG_ON(!inode); - mlog_entry_void(); - mlog(0, "inode %llu try to take %s open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); @@ -1799,7 +1735,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write) level, DLM_LKF_NOQUEUE, 0); out: - mlog_exit(status); return status; } @@ -1811,8 +1746,6 @@ void ocfs2_open_unlock(struct inode *inode) struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry_void(); - mlog(0, "inode %llu drop open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1827,7 +1760,7 @@ void ocfs2_open_unlock(struct inode *inode) DLM_LOCK_EX); out: - mlog_exit_void(); + return; } static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres, @@ -2043,8 +1976,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, { int kick = 0; - mlog_entry_void(); - /* If we know that another node is waiting on our lock, kick * the downconvert thread * pre-emptively when we reach a release * condition. 
*/ @@ -2065,8 +1996,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, if (kick) ocfs2_wake_downconvert_thread(osb); - - mlog_exit_void(); } #define OCFS2_SEC_BITS 34 @@ -2095,8 +2024,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; - mlog_entry_void(); - lvb = ocfs2_dlm_lvb(&lockres->l_lksb); /* @@ -2128,8 +2055,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) out: mlog_meta_lvb(0, lockres); - - mlog_exit_void(); } static void ocfs2_unpack_timespec(struct timespec *spec, @@ -2145,8 +2070,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; - mlog_entry_void(); - mlog_meta_lvb(0, lockres); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2177,8 +2100,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) ocfs2_unpack_timespec(&inode->i_ctime, be64_to_cpu(lvb->lvb_ictime_packed)); spin_unlock(&oi->ip_lock); - - mlog_exit_void(); } static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, @@ -2205,8 +2126,6 @@ static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) unsigned long flags; int status = 0; - mlog_entry_void(); - refresh_check: spin_lock_irqsave(&lockres->l_lock, flags); if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { @@ -2227,7 +2146,7 @@ refresh_check: status = 1; bail: - mlog_exit(status); + mlog(0, "status %d\n", status); return status; } @@ -2237,7 +2156,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre int status) { unsigned long flags; - mlog_entry_void(); spin_lock_irqsave(&lockres->l_lock, flags); lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); @@ -2246,8 +2164,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre spin_unlock_irqrestore(&lockres->l_lock, flags); wake_up(&lockres->l_event); - - mlog_exit_void(); } /* may or may not return a bh if it went to disk. */ @@ -2260,8 +2176,6 @@ static int ocfs2_inode_lock_update(struct inode *inode, struct ocfs2_dinode *fe; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry_void(); - if (ocfs2_mount_local(osb)) goto bail; @@ -2330,7 +2244,6 @@ static int ocfs2_inode_lock_update(struct inode *inode, bail_refresh: ocfs2_complete_lock_res_refresh(lockres, status); bail: - mlog_exit(status); return status; } @@ -2374,8 +2287,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, BUG_ON(!inode); - mlog_entry_void(); - mlog(0, "inode %llu, take %s META lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); @@ -2467,7 +2378,6 @@ bail: if (local_bh) brelse(local_bh); - mlog_exit(status); return status; } @@ -2517,7 +2427,6 @@ int ocfs2_inode_lock_atime(struct inode *inode, { int ret; - mlog_entry_void(); ret = ocfs2_inode_lock(inode, NULL, 0); if (ret < 0) { mlog_errno(ret); @@ -2545,7 +2454,6 @@ int ocfs2_inode_lock_atime(struct inode *inode, } else *level = 0; - mlog_exit(ret); return ret; } @@ -2556,8 +2464,6 @@ void ocfs2_inode_unlock(struct inode *inode, struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry_void(); - mlog(0, "inode %llu drop %s META lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? 
"EXMODE" : "PRMODE"); @@ -2565,8 +2471,6 @@ void ocfs2_inode_unlock(struct inode *inode, if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); - - mlog_exit_void(); } int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) @@ -2617,8 +2521,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; - mlog_entry_void(); - if (ocfs2_is_hard_readonly(osb)) return -EROFS; @@ -2650,7 +2552,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, ocfs2_track_lock_refresh(lockres); } bail: - mlog_exit(status); return status; } @@ -2869,8 +2770,15 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) return iter; } -/* So that debugfs.ocfs2 can determine which format is being used */ -#define OCFS2_DLM_DEBUG_STR_VERSION 2 +/* + * Version is used by debugfs.ocfs2 to determine the format being used + * + * New in version 2 + * - Lock stats printed + * New in version 3 + * - Max time in lock stats is in usecs (instead of nsecs) + */ +#define OCFS2_DLM_DEBUG_STR_VERSION 3 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) { int i; @@ -2912,18 +2820,18 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) seq_printf(m, "0x%x\t", lvb[i]); #ifdef CONFIG_OCFS2_FS_STATS -# define lock_num_prmode(_l) (_l)->l_lock_num_prmode -# define lock_num_exmode(_l) (_l)->l_lock_num_exmode -# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed -# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed -# define lock_total_prmode(_l) (_l)->l_lock_total_prmode -# define lock_total_exmode(_l) (_l)->l_lock_total_exmode -# define lock_max_prmode(_l) (_l)->l_lock_max_prmode -# define lock_max_exmode(_l) (_l)->l_lock_max_exmode -# define lock_refresh(_l) (_l)->l_lock_refresh +# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets) +# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets) +# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail) +# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail) +# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total) +# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total) +# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) +# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) +# define lock_refresh(_l) ((_l)->l_lock_refresh) #else -# define lock_num_prmode(_l) (0ULL) -# define lock_num_exmode(_l) (0ULL) +# define lock_num_prmode(_l) (0) +# define lock_num_exmode(_l) (0) # define lock_num_prmode_failed(_l) (0) # define lock_num_exmode_failed(_l) (0) # define lock_total_prmode(_l) (0ULL) @@ -2933,8 +2841,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_refresh(_l) (0) #endif /* The following seq_print was added in version 2 of this output */ - seq_printf(m, "%llu\t" - "%llu\t" + seq_printf(m, "%u\t" + "%u\t" "%u\t" "%u\t" "%llu\t" @@ -3054,8 +2962,6 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) int status = 0; struct ocfs2_cluster_connection *conn = NULL; - mlog_entry_void(); - if (ocfs2_mount_local(osb)) { osb->node_num = 0; goto local; @@ -3112,15 +3018,12 @@ bail: kthread_stop(osb->dc_task); } - mlog_exit(status); return status; } void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending) { - mlog_entry_void(); - ocfs2_drop_osb_locks(osb); /* @@ -3143,8 +3046,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, osb->cconn = NULL; ocfs2_dlm_shutdown_debug(osb); 
- - mlog_exit_void(); } static int ocfs2_drop_lock(struct ocfs2_super *osb, @@ -3226,7 +3127,6 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, ocfs2_wait_on_busy_lock(lockres); out: - mlog_exit(0); return 0; } @@ -3284,8 +3184,6 @@ int ocfs2_drop_inode_locks(struct inode *inode) { int status, err; - mlog_entry_void(); - /* No need to call ocfs2_mark_lockres_freeing here - * ocfs2_clear_inode has done it for us. */ @@ -3310,7 +3208,6 @@ int ocfs2_drop_inode_locks(struct inode *inode) if (err < 0 && !status) status = err; - mlog_exit(status); return status; } @@ -3352,8 +3249,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, int ret; u32 dlm_flags = DLM_LKF_CONVERT; - mlog_entry_void(); - mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); @@ -3375,7 +3270,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, ret = 0; bail: - mlog_exit(ret); return ret; } @@ -3385,8 +3279,6 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, { assert_spin_locked(&lockres->l_lock); - mlog_entry_void(); - if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { /* If we're already trying to cancel a lock conversion * then just drop the spinlock and allow the caller to @@ -3416,8 +3308,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, { int ret; - mlog_entry_void(); - ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, DLM_LKF_CANCEL); if (ret) { @@ -3427,7 +3317,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s\n", lockres->l_name); - mlog_exit(ret); return ret; } @@ -3443,8 +3332,6 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, int set_lvb = 0; unsigned int gen; - mlog_entry_void(); - spin_lock_irqsave(&lockres->l_lock, flags); recheck: @@ -3619,14 +3506,14 @@ downconvert: gen); leave: - mlog_exit(ret); + if (ret) + mlog_errno(ret); return ret; leave_requeue: spin_unlock_irqrestore(&lockres->l_lock, flags); ctl->requeue = 1; - mlog_exit(0); return 0; } @@ -3859,8 +3746,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, oinfo->dqi_gi.dqi_type); - mlog_entry_void(); - lvb = ocfs2_dlm_lvb(&lockres->l_lksb); lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); @@ -3869,8 +3754,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); - - mlog_exit_void(); } void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) @@ -3879,10 +3762,8 @@ void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; - mlog_entry_void(); if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(osb, lockres, level); - mlog_exit_void(); } static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) @@ -3937,8 +3818,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; int status = 0; - mlog_entry_void(); - /* On RO devices, locking really isn't needed... 
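/*
 * The qinfo LVB packing above uses cpu_to_be32() because the lock
 * value block is shared across the cluster: a fixed big-endian layout
 * keeps nodes of different byte order in agreement. Round-trip sketch:
 */
#include <linux/types.h>
#include <asm/byteorder.h>

static u32 sample_lvb_roundtrip(u32 host_val)
{
	__be32 wire = cpu_to_be32(host_val);	/* as stored in the LVB */
	return be32_to_cpu(wire);		/* as read on any node */
}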
*/ if (ocfs2_is_hard_readonly(osb)) { if (ex) @@ -3961,7 +3840,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) ocfs2_qinfo_unlock(oinfo, ex); ocfs2_complete_lock_res_refresh(lockres, status); bail: - mlog_exit(status); return status; } @@ -4007,8 +3885,6 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, * considered valid until we remove the OCFS2_LOCK_QUEUED * flag. */ - mlog_entry_void(); - BUG_ON(!lockres); BUG_ON(!lockres->l_ops); @@ -4042,15 +3918,11 @@ unqueue: if (ctl.unblock_action != UNBLOCK_CONTINUE && lockres->l_ops->post_unlock) lockres->l_ops->post_unlock(osb, lockres); - - mlog_exit_void(); } static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { - mlog_entry_void(); - assert_spin_locked(&lockres->l_lock); if (lockres->l_flags & OCFS2_LOCK_FREEING) { @@ -4071,8 +3943,6 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, osb->blocked_lock_count++; } spin_unlock(&osb->dc_task_lock); - - mlog_exit_void(); } static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) @@ -4080,8 +3950,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) unsigned long processed; struct ocfs2_lock_res *lockres; - mlog_entry_void(); - spin_lock(&osb->dc_task_lock); /* grab this early so we know to try again if a state change and * wake happens part-way through our work */ @@ -4105,8 +3973,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) spin_lock(&osb->dc_task_lock); } spin_unlock(&osb->dc_task_lock); - - mlog_exit_void(); } static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb) diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 254652a9b54..745db42528d 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/types.h> -#define MLOG_MASK_PREFIX ML_EXPORT #include <cluster/masklog.h> #include "ocfs2.h" @@ -40,6 +39,7 @@ #include "buffer_head_io.h" #include "suballoc.h" +#include "ocfs2_trace.h" struct ocfs2_inode_handle { @@ -56,10 +56,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, int status, set; struct dentry *result; - mlog_entry("(0x%p, 0x%p)\n", sb, handle); + trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno); if (blkno == 0) { - mlog(0, "nfs wants inode with blkno: 0\n"); result = ERR_PTR(-ESTALE); goto bail; } @@ -83,6 +82,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, } status = ocfs2_test_inode_bit(osb, blkno, &set); + trace_ocfs2_get_dentry_test_bit(status, set); if (status < 0) { if (status == -EINVAL) { /* @@ -90,18 +90,14 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, * as an inode, we return -ESTALE to be * nice */ - mlog(0, "test inode bit failed %d\n", status); status = -ESTALE; - } else { + } else mlog(ML_ERROR, "test inode bit failed %d\n", status); - } goto unlock_nfs_sync; } /* If the inode allocator bit is clear, this inode must be stale */ if (!set) { - mlog(0, "inode %llu suballoc bit is clear\n", - (unsigned long long)blkno); status = -ESTALE; goto unlock_nfs_sync; } @@ -114,8 +110,8 @@ unlock_nfs_sync: check_err: if (status < 0) { if (status == -ESTALE) { - mlog(0, "stale inode ino: %llu generation: %u\n", - (unsigned long long)blkno, handle->ih_generation); + trace_ocfs2_get_dentry_stale((unsigned long long)blkno, + handle->ih_generation); } result = ERR_PTR(status); goto bail; @@ -130,8 +126,9 @@ check_err: check_gen: if (handle->ih_generation != inode->i_generation) { 
iput(inode); - mlog(0, "stale inode ino: %llu generation: %u\n", - (unsigned long long)blkno, handle->ih_generation); + trace_ocfs2_get_dentry_generation((unsigned long long)blkno, + handle->ih_generation, + inode->i_generation); result = ERR_PTR(-ESTALE); goto bail; } @@ -141,7 +138,7 @@ check_gen: mlog_errno(PTR_ERR(result)); bail: - mlog_exit_ptr(result); + trace_ocfs2_get_dentry_end(result); return result; } @@ -152,11 +149,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) struct dentry *parent; struct inode *dir = child->d_inode; - mlog_entry("(0x%p, '%.*s')\n", child, - child->d_name.len, child->d_name.name); - - mlog(0, "find parent of directory %llu\n", - (unsigned long long)OCFS2_I(dir)->ip_blkno); + trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno); status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { @@ -178,7 +172,7 @@ bail_unlock: ocfs2_inode_unlock(dir, 0); bail: - mlog_exit_ptr(parent); + trace_ocfs2_get_parent_end(parent); return parent; } @@ -193,9 +187,9 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, u32 generation; __le32 *fh = (__force __le32 *) fh_in; - mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, - dentry->d_name.len, dentry->d_name.name, - fh, len, connectable); + trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len, + dentry->d_name.name, + fh, len, connectable); if (connectable && (len < 6)) { *max_len = 6; @@ -210,8 +204,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, blkno = OCFS2_I(inode)->ip_blkno; generation = inode->i_generation; - mlog(0, "Encoding fh: blkno: %llu, generation: %u\n", - (unsigned long long)blkno, generation); + trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation); len = 3; fh[0] = cpu_to_le32((u32)(blkno >> 32)); @@ -236,14 +229,14 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, len = 6; type = 2; - mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", - (unsigned long long)blkno, generation); + trace_ocfs2_encode_fh_parent((unsigned long long)blkno, + generation); } *max_len = len; bail: - mlog_exit(type); + trace_ocfs2_encode_fh_type(type); return type; } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 09e3fdfa6d3..23457b491e8 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -28,7 +28,6 @@ #include <linux/types.h> #include <linux/fiemap.h> -#define MLOG_MASK_PREFIX ML_EXTENT_MAP #include <cluster/masklog.h> #include "ocfs2.h" @@ -39,6 +38,7 @@ #include "inode.h" #include "super.h" #include "symlink.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -841,10 +841,9 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, u64 p_block, p_count; int i, count, done = 0; - mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, " - "flags = %x, validate = %p)\n", - inode, (unsigned long long)v_block, nr, bhs, flags, - validate); + trace_ocfs2_read_virt_blocks( + inode, (unsigned long long)v_block, nr, bhs, flags, + validate); if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= i_size_read(inode)) { @@ -897,7 +896,6 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, } out: - mlog_exit(rc); return rc; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a6651956482..41565ae5285 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -38,7 +38,6 @@ #include <linux/quotaops.h> #include <linux/blkdev.h> -#define MLOG_MASK_PREFIX ML_INODE #include 
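/*
 * ocfs2_encode_fh() above packs (blkno, generation) into three __le32
 * words per object. The matching decode, sketched with illustrative
 * helper names (the real decode lives on the fh_to_dentry side):
 */
#include <linux/types.h>
#include <asm/byteorder.h>

static u64 sample_fh_to_blkno(const __le32 *fh)
{
	return ((u64)le32_to_cpu(fh[0]) << 32) | le32_to_cpu(fh[1]);
}

static u32 sample_fh_to_generation(const __le32 *fh)
{
	return le32_to_cpu(fh[2]);
}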
<cluster/masklog.h> #include "ocfs2.h" @@ -61,6 +60,7 @@ #include "acl.h" #include "quota.h" #include "refcounttree.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -99,8 +99,10 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) int mode = file->f_flags; struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, - file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); + trace_ocfs2_file_open(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, mode); if (file->f_mode & FMODE_WRITE) dquot_initialize(inode); @@ -135,7 +137,6 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) } leave: - mlog_exit(status); return status; } @@ -143,19 +144,19 @@ static int ocfs2_file_release(struct inode *inode, struct file *file) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, - file->f_path.dentry->d_name.len, - file->f_path.dentry->d_name.name); - spin_lock(&oi->ip_lock); if (!--oi->ip_open_count) oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; + + trace_ocfs2_file_release(inode, file, file->f_path.dentry, + oi->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + oi->ip_open_count); spin_unlock(&oi->ip_lock); ocfs2_free_file_private(inode, file); - mlog_exit(0); - return 0; } @@ -177,9 +178,11 @@ static int ocfs2_sync_file(struct file *file, int datasync) struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync, - file->f_path.dentry, file->f_path.dentry->d_name.len, - file->f_path.dentry->d_name.name); + trace_ocfs2_sync_file(inode, file, file->f_path.dentry, + OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned long long)datasync); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { /* @@ -195,7 +198,8 @@ static int ocfs2_sync_file(struct file *file, int datasync) err = jbd2_journal_force_commit(journal); bail: - mlog_exit(err); + if (err) + mlog_errno(err); return (err < 0) ? 
-EIO : 0; } @@ -251,8 +255,6 @@ int ocfs2_update_inode_atime(struct inode *inode, handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; - mlog_entry_void(); - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -280,7 +282,6 @@ int ocfs2_update_inode_atime(struct inode *inode, out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: - mlog_exit(ret); return ret; } @@ -291,7 +292,6 @@ static int ocfs2_set_inode_size(handle_t *handle, { int status; - mlog_entry_void(); i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -303,7 +303,6 @@ static int ocfs2_set_inode_size(handle_t *handle, } bail: - mlog_exit(status); return status; } @@ -375,8 +374,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, struct ocfs2_dinode *di; u64 cluster_bytes; - mlog_entry_void(); - /* * We need to CoW the cluster contains the offset if it is reflinked * since we will call ocfs2_zero_range_for_truncate later which will @@ -429,8 +426,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, out_commit: ocfs2_commit_trans(osb, handle); out: - - mlog_exit(status); return status; } @@ -442,14 +437,14 @@ static int ocfs2_truncate_file(struct inode *inode, struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(inode = %llu, new_i_size = %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)new_i_size); - /* We trust di_bh because it comes from ocfs2_inode_lock(), which * already validated it */ fe = (struct ocfs2_dinode *) di_bh->b_data; + trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); + mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), "Inode %llu, inode i_size = %lld != di " "i_size = %llu, i_flags = 0x%x\n", @@ -459,19 +454,14 @@ static int ocfs2_truncate_file(struct inode *inode, le32_to_cpu(fe->i_flags)); if (new_i_size > le64_to_cpu(fe->i_size)) { - mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", - (unsigned long long)le64_to_cpu(fe->i_size), - (unsigned long long)new_i_size); + trace_ocfs2_truncate_file_error( + (unsigned long long)le64_to_cpu(fe->i_size), + (unsigned long long)new_i_size); status = -EINVAL; mlog_errno(status); goto bail; } - mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), - (unsigned long long)le64_to_cpu(fe->i_size), - (unsigned long long)new_i_size); - /* lets handle the simple truncate cases before doing any more * cluster locking. */ if (new_i_size == le64_to_cpu(fe->i_size)) @@ -525,7 +515,6 @@ bail: if (!status && OCFS2_I(inode)->ip_clusters == 0) status = ocfs2_try_remove_refcount_tree(inode, di_bh); - mlog_exit(status); return status; } @@ -578,8 +567,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, struct ocfs2_extent_tree et; int did_quota = 0; - mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); - /* * This function only exists for file systems which don't * support holes. 
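/*
 * __ocfs2_extend_allocation() (continuing below) restarts the journal
 * handle when it runs out of credits mid-allocation. The underlying
 * jbd2 pattern, sketched independently of the ocfs2 wrappers:
 */
#include <linux/jbd2.h>

static int sample_more_credits(handle_t *handle, int nblocks)
{
	int ret;

	ret = jbd2_journal_extend(handle, nblocks);
	if (ret > 0)	/* transaction too full to extend: restart it */
		ret = jbd2_journal_restart(handle, nblocks);
	return ret;	/* 0 on success, negative errno on failure */
}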
@@ -596,11 +583,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), - clusters_to_add); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, &data_ac, &meta_ac); @@ -620,6 +602,12 @@ restart_all: } restarted_transaction: + trace_ocfs2_extend_allocation( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)i_size_read(inode), + le32_to_cpu(fe->i_clusters), clusters_to_add, + why, restart_func); + status = dquot_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (status) @@ -666,13 +654,11 @@ restarted_transaction: if (why != RESTART_NONE && clusters_to_add) { if (why == RESTART_META) { - mlog(0, "restarting function.\n"); restart_func = 1; status = 0; } else { BUG_ON(why != RESTART_TRANS); - mlog(0, "restarting transaction.\n"); /* TODO: This can be more intelligent. */ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, @@ -689,11 +675,11 @@ restarted_transaction: } } - mlog(0, "fe: i_clusters = %u, i_size=%llu\n", + trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno, le32_to_cpu(fe->i_clusters), - (unsigned long long)le64_to_cpu(fe->i_size)); - mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", - OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); + (unsigned long long)le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, + (unsigned long long)i_size_read(inode)); leave: if (status < 0 && did_quota) @@ -718,7 +704,6 @@ leave: brelse(bh); bh = NULL; - mlog_exit(status); return status; } @@ -785,10 +770,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, if (!zero_to) zero_to = PAGE_CACHE_SIZE; - mlog(0, - "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", - (unsigned long long)abs_from, (unsigned long long)abs_to, - index, zero_from, zero_to); + trace_ocfs2_write_zero_page( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)abs_from, + (unsigned long long)abs_to, + index, zero_from, zero_to); /* We know that zero_from is block aligned */ for (block_start = zero_from; block_start < zero_to; @@ -928,9 +914,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, u64 next_pos; u64 zero_pos = range_start; - mlog(0, "range_start = %llu, range_end = %llu\n", - (unsigned long long)range_start, - (unsigned long long)range_end); + trace_ocfs2_zero_extend_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)range_start, + (unsigned long long)range_end); BUG_ON(range_start >= range_end); while (zero_pos < range_end) { @@ -962,9 +949,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, struct super_block *sb = inode->i_sb; zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); - mlog(0, "zero_start %llu for i_size %llu\n", - (unsigned long long)zero_start, - (unsigned long long)i_size_read(inode)); + trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); while (zero_start < zero_to_size) { ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, zero_to_size, @@ -1113,30 +1100,20 @@ int ocfs2_setattr(struct 
dentry *dentry, struct iattr *attr) struct dquot *transfer_to[MAXQUOTAS] = { }; int qtype; - mlog_entry("(0x%p, '%.*s')\n", dentry, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_setattr(inode, dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + dentry->d_name.len, dentry->d_name.name, + attr->ia_valid, attr->ia_mode, + attr->ia_uid, attr->ia_gid); /* ensuring we don't even attempt to truncate a symlink */ if (S_ISLNK(inode->i_mode)) attr->ia_valid &= ~ATTR_SIZE; - if (attr->ia_valid & ATTR_MODE) - mlog(0, "mode change: %d\n", attr->ia_mode); - if (attr->ia_valid & ATTR_UID) - mlog(0, "uid change: %d\n", attr->ia_uid); - if (attr->ia_valid & ATTR_GID) - mlog(0, "gid change: %d\n", attr->ia_gid); - if (attr->ia_valid & ATTR_SIZE) - mlog(0, "size change...\n"); - if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) - mlog(0, "time change...\n"); - #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ | ATTR_GID | ATTR_UID | ATTR_MODE) - if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { - mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - } status = inode_change_ok(inode, attr); if (status) @@ -1274,7 +1251,6 @@ bail: mlog_errno(status); } - mlog_exit(status); return status; } @@ -1287,8 +1263,6 @@ int ocfs2_getattr(struct vfsmount *mnt, struct ocfs2_super *osb = sb->s_fs_info; int err; - mlog_entry_void(); - err = ocfs2_inode_revalidate(dentry); if (err) { if (err != -ENOENT) @@ -1302,8 +1276,6 @@ int ocfs2_getattr(struct vfsmount *mnt, stat->blksize = osb->s_clustersize; bail: - mlog_exit(err); - return err; } @@ -1314,8 +1286,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) if (flags & IPERM_FLAG_RCU) return -ECHILD; - mlog_entry_void(); - ret = ocfs2_inode_lock(inode, NULL, 0); if (ret) { if (ret != -ENOENT) @@ -1327,7 +1297,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) ocfs2_inode_unlock(inode, 0); out: - mlog_exit(ret); return ret; } @@ -1339,8 +1308,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di; - mlog_entry("(Inode %llu, mode 0%o)\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); + trace_ocfs2_write_remove_suid( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + inode->i_mode); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { @@ -1368,7 +1338,6 @@ static int __ocfs2_write_remove_suid(struct inode *inode, out_trans: ocfs2_commit_trans(osb, handle); out: - mlog_exit(ret); return ret; } @@ -1547,8 +1516,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, * partial clusters here. There's no need to worry about * physical allocation - the zeroing code knows to skip holes. 
*/ - mlog(0, "byte start: %llu, end: %llu\n", - (unsigned long long)start, (unsigned long long)end); + trace_ocfs2_zero_partial_clusters( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)start, (unsigned long long)end); /* * If both edges are on a cluster boundary then there's no @@ -1572,8 +1542,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if (tmpend > end) tmpend = end; - mlog(0, "1st range: start: %llu, tmpend: %llu\n", - (unsigned long long)start, (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, + (unsigned long long)tmpend); ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); if (ret) @@ -1587,8 +1557,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, */ start = end & ~(osb->s_clustersize - 1); - mlog(0, "2nd range: start: %llu, end: %llu\n", - (unsigned long long)start, (unsigned long long)end); + trace_ocfs2_zero_partial_clusters_range2( + (unsigned long long)start, (unsigned long long)end); ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); if (ret) @@ -1688,6 +1658,11 @@ static int ocfs2_remove_inode_range(struct inode *inode, ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); + trace_ocfs2_remove_inode_range( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)byte_start, + (unsigned long long)byte_len); + if (byte_len == 0) return 0; @@ -1734,11 +1709,6 @@ static int ocfs2_remove_inode_range(struct inode *inode, trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; cluster_in_el = trunc_end; - mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)byte_start, - (unsigned long long)byte_len, trunc_start, trunc_end); - ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); if (ret) { mlog_errno(ret); @@ -2093,7 +2063,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, int ret = 0, meta_level = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - loff_t saved_pos, end; + loff_t saved_pos = 0, end; /* * We start with a read level meta lock and only jump to an ex @@ -2132,12 +2102,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, /* work on a copy of ppos until we're sure that we won't have * to recalculate it due to relocking. 
*/ - if (appending) { + if (appending) saved_pos = i_size_read(inode); - mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); - } else { + else saved_pos = *ppos; - } end = saved_pos + count; @@ -2208,6 +2176,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, *ppos = saved_pos; out_unlock: + trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, + saved_pos, appending, count, + direct_io, has_refcount); + if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2233,10 +2205,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); - mlog_entry("(0x%p, %u, '%.*s')\n", file, - (unsigned int)nr_segs, - file->f_path.dentry->d_name.len, - file->f_path.dentry->d_name.name); + trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name, + (unsigned int)nr_segs); if (iocb->ki_left == 0) return 0; @@ -2402,7 +2375,6 @@ out_sems: if (written) ret = written; - mlog_exit(ret); return ret; } @@ -2438,10 +2410,11 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, .u.file = out, }; - mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, - (unsigned int)len, - out->f_path.dentry->d_name.len, - out->f_path.dentry->d_name.name); + + trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + out->f_path.dentry->d_name.len, + out->f_path.dentry->d_name.name, len); if (pipe->inode) mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); @@ -2485,7 +2458,6 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } - mlog_exit(ret); return ret; } @@ -2498,10 +2470,10 @@ static ssize_t ocfs2_file_splice_read(struct file *in, int ret = 0, lock_level = 0; struct inode *inode = in->f_path.dentry->d_inode; - mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, - (unsigned int)len, - in->f_path.dentry->d_name.len, - in->f_path.dentry->d_name.name); + trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name, len); /* * See the comment in ocfs2_file_aio_read() @@ -2516,7 +2488,6 @@ static ssize_t ocfs2_file_splice_read(struct file *in, ret = generic_file_splice_read(in, ppos, pipe, len, flags); bail: - mlog_exit(ret); return ret; } @@ -2529,10 +2500,11 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, struct file *filp = iocb->ki_filp; struct inode *inode = filp->f_path.dentry->d_inode; - mlog_entry("(0x%p, %u, '%.*s')\n", filp, - (unsigned int)nr_segs, - filp->f_path.dentry->d_name.len, - filp->f_path.dentry->d_name.name); + trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + filp->f_path.dentry->d_name.len, + filp->f_path.dentry->d_name.name, nr_segs); + if (!inode) { ret = -EINVAL; @@ -2578,8 +2550,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ocfs2_inode_unlock(inode, lock_level); ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); - if (ret == -EINVAL) - mlog(0, "generic_file_aio_read returned -EINVAL\n"); + trace_generic_file_aio_read_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); @@ -2597,7 +2568,6 @@ bail: } if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); - 
mlog_exit(ret); return ret; } diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 1aa863dd901..d8208b20dc5 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -28,7 +28,6 @@ #include <linux/types.h> #include <linux/highmem.h> -#define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> #include "ocfs2.h" @@ -37,6 +36,7 @@ #include "heartbeat.h" #include "inode.h" #include "journal.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -66,7 +66,7 @@ void ocfs2_do_node_down(int node_num, void *data) BUG_ON(osb->node_num == node_num); - mlog(0, "ocfs2: node down event for %d\n", node_num); + trace_ocfs2_do_node_down(node_num); if (!osb->cconn) { /* diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 4068c6c4c6f..177d3a6c2a5 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -31,7 +31,6 @@ #include <asm/byteorder.h> -#define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> #include "ocfs2.h" @@ -53,6 +52,7 @@ #include "uptodate.h" #include "xattr.h" #include "refcounttree.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -131,7 +131,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; - mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno); + trace_ocfs2_iget_begin((unsigned long long)blkno, flags, + sysfile_type); /* Ok. By now we've either got the offsets passed to us by the * caller, or we just pulled them off the bh. Lets do some @@ -152,16 +153,16 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, /* inode was *not* in the inode cache. 2.6.x requires * us to do our own read_inode call and unlock it * afterwards. */ - if (inode && inode->i_state & I_NEW) { - mlog(0, "Inode was not in inode cache, reading it.\n"); - ocfs2_read_locked_inode(inode, &args); - unlock_new_inode(inode); - } if (inode == NULL) { inode = ERR_PTR(-ENOMEM); mlog_errno(PTR_ERR(inode)); goto bail; } + trace_ocfs2_iget5_locked(inode->i_state); + if (inode->i_state & I_NEW) { + ocfs2_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } if (is_bad_inode(inode)) { iput(inode); inode = ERR_PTR(-ESTALE); @@ -170,9 +171,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, bail: if (!IS_ERR(inode)) { - mlog(0, "returning inode with number %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_exit_ptr(inode); + trace_ocfs2_iget_end(inode, + (unsigned long long)OCFS2_I(inode)->ip_blkno); } return inode; @@ -192,18 +192,17 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque) struct ocfs2_inode_info *oi = OCFS2_I(inode); int ret = 0; - mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); - args = opaque; mlog_bug_on_msg(!inode, "No inode in find actor!\n"); + trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); + if (oi->ip_blkno != args->fi_blkno) goto bail; ret = 1; bail: - mlog_exit(ret); return ret; } @@ -218,8 +217,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; - mlog_entry("inode = %p, opaque = %p\n", inode, opaque); - inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; if (args->fi_sysfile_type != 0) @@ -235,7 +232,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); - mlog_exit(0); return 0; } @@ -246,9 +242,6 @@ 
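Many of the conversions above, such as trace_ocfs2_do_node_down() in heartbeat.c, carry a single integer, and the new header handles those with one reusable event class rather than per-event boilerplate: see the DECLARE_EVENT_CLASS(ocfs2__int, ...) and DEFINE_OCFS2_INT_EVENT() definitions at the end of this diff. Instantiating such an event is then a one-liner; presumably the header does something like:

/* Reuses the ocfs2__int class; after this, callers can invoke
 * trace_ocfs2_do_node_down(int num) as seen in the hunk above. */
DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);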
void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, struct ocfs2_super *osb; int use_plocks = 1; - mlog_entry("(0x%p, size:%llu)\n", inode, - (unsigned long long)le64_to_cpu(fe->i_size)); - sb = inode->i_sb; osb = OCFS2_SB(sb); @@ -300,20 +293,20 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_nlink = ocfs2_read_links_count(fe); + trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, + le32_to_cpu(fe->i_flags)); if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; inode->i_flags |= S_NOQUOTA; } - + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; - mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { inode->i_flags |= S_NOQUOTA; } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { - mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); /* we can't actually hit this as read_inode can't * handle superblocks today ;-) */ BUG(); @@ -381,7 +374,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, if (S_ISDIR(inode->i_mode)) ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, OCFS2_RESV_FLAG_DIR); - mlog_exit_void(); } static int ocfs2_read_locked_inode(struct inode *inode, @@ -394,8 +386,6 @@ static int ocfs2_read_locked_inode(struct inode *inode, int status, can_lock; u32 generation = 0; - mlog_entry("(0x%p, 0x%p)\n", inode, args); - status = -EINVAL; if (inode == NULL || inode->i_sb == NULL) { mlog(ML_ERROR, "bad inode\n"); @@ -443,6 +433,9 @@ static int ocfs2_read_locked_inode(struct inode *inode, && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); + trace_ocfs2_read_locked_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); + /* * To maintain backwards compatibility with older versions of * ocfs2-tools, we still store the generation value for system @@ -534,7 +527,6 @@ bail: if (args && bh) brelse(bh); - mlog_exit(status); return status; } @@ -551,8 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct ocfs2_dinode *fe; handle_t *handle = NULL; - mlog_entry_void(); - fe = (struct ocfs2_dinode *) fe_bh->b_data; /* @@ -600,7 +590,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, out: if (handle) ocfs2_commit_trans(osb, handle); - mlog_exit(status); return status; } @@ -696,8 +685,6 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, spin_lock(&osb->osb_lock); if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { - mlog(0, "Recovery is happening on orphan dir %d, will skip " - "this inode\n", slot); ret = -EDEADLK; goto out; } @@ -706,6 +693,7 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, osb->osb_orphan_wipes[slot]++; out: spin_unlock(&osb->osb_lock); + trace_ocfs2_check_orphan_recovery_state(slot, ret); return ret; } @@ -816,6 +804,10 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task, + (unsigned long long)oi->ip_blkno, + oi->ip_flags); + /* We shouldn't be getting here for the root directory * inode.. 
*/ if (inode == osb->root_inode) { @@ -828,11 +820,8 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) * have to skip deleting this guy. That's OK though because * the node who's doing the actual deleting should handle it * anyway. */ - if (current == osb->dc_task) { - mlog(0, "Skipping delete of %lu because we're currently " - "in downconvert\n", inode->i_ino); + if (current == osb->dc_task) goto bail; - } spin_lock(&oi->ip_lock); /* OCFS2 *never* deletes system files. This should technically @@ -847,11 +836,8 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) /* If we have allowd wipe of this inode for another node, it * will be marked here so we can safely skip it. Recovery will * cleanup any inodes we might inadvertantly skip here. */ - if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { - mlog(0, "Skipping delete of %lu because another node " - "has done this for us.\n", inode->i_ino); + if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) goto bail_unlock; - } ret = 1; bail_unlock: @@ -868,28 +854,27 @@ static int ocfs2_query_inode_wipe(struct inode *inode, struct buffer_head *di_bh, int *wipe) { - int status = 0; + int status = 0, reason = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di; *wipe = 0; + trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno, + inode->i_nlink); + /* While we were waiting for the cluster lock in * ocfs2_delete_inode, another node might have asked to delete * the inode. Recheck our flags to catch this. */ if (!ocfs2_inode_is_valid_to_delete(inode)) { - mlog(0, "Skipping delete of %llu because flags changed\n", - (unsigned long long)oi->ip_blkno); + reason = 1; goto bail; } /* Now that we have an up to date inode, we can double check * the link count. */ - if (inode->i_nlink) { - mlog(0, "Skipping delete of %llu because nlink = %u\n", - (unsigned long long)oi->ip_blkno, inode->i_nlink); + if (inode->i_nlink) goto bail; - } /* Do some basic inode verification... */ di = (struct ocfs2_dinode *) di_bh->b_data; @@ -904,9 +889,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode, * ORPHANED_FL not. */ if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { - mlog(0, "Reflinked inode %llu is no longer orphaned. 
" - "it shouldn't be deleted\n", - (unsigned long long)oi->ip_blkno); + reason = 2; goto bail; } @@ -943,8 +926,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode, status = ocfs2_try_open_lock(inode, 1); if (status == -EAGAIN) { status = 0; - mlog(0, "Skipping delete of %llu because it is in use on " - "other nodes\n", (unsigned long long)oi->ip_blkno); + reason = 3; goto bail; } if (status < 0) { @@ -953,11 +935,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode, } *wipe = 1; - mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", - (unsigned long long)oi->ip_blkno, - le16_to_cpu(di->i_orphaned_slot)); + trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot)); bail: + trace_ocfs2_query_inode_wipe_end(status, reason); return status; } @@ -967,8 +948,8 @@ bail: static void ocfs2_cleanup_delete_inode(struct inode *inode, int sync_data) { - mlog(0, "Cleanup inode %llu, sync = %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); + trace_ocfs2_cleanup_delete_inode( + (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) write_inode_now(inode, 1); truncate_inode_pages(&inode->i_data, 0); @@ -980,15 +961,15 @@ static void ocfs2_delete_inode(struct inode *inode) sigset_t oldset; struct buffer_head *di_bh = NULL; - mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + trace_ocfs2_delete_inode(inode->i_ino, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + is_bad_inode(inode)); /* When we fail in read_inode() we mark inode as bad. The second test * catches the case when inode allocation fails before allocating * a block for inode. */ - if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) { - mlog(0, "Skipping delete of bad inode\n"); + if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) goto bail; - } dquot_initialize(inode); @@ -1080,7 +1061,7 @@ bail_unlock_nfs_sync: bail_unblock: ocfs2_unblock_signals(&oldset); bail: - mlog_exit_void(); + return; } static void ocfs2_clear_inode(struct inode *inode) @@ -1088,11 +1069,9 @@ static void ocfs2_clear_inode(struct inode *inode) int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry_void(); - end_writeback(inode); - mlog(0, "Clearing inode: %llu, nlink = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); + trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink); mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); @@ -1181,8 +1160,6 @@ static void ocfs2_clear_inode(struct inode *inode) */ jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, &oi->ip_jinode); - - mlog_exit_void(); } void ocfs2_evict_inode(struct inode *inode) @@ -1204,17 +1181,14 @@ int ocfs2_drop_inode(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); int res; - mlog_entry_void(); - - mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", - (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); + trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, + inode->i_nlink, oi->ip_flags); if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) res = 1; else res = generic_drop_inode(inode); - mlog_exit_void(); return res; } @@ -1226,11 +1200,11 @@ int ocfs2_inode_revalidate(struct dentry *dentry) struct inode *inode = dentry->d_inode; int status = 0; - mlog_entry("(inode = 0x%p, ino = %llu)\n", inode, - inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL); + trace_ocfs2_inode_revalidate(inode, + inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL, + inode ? 
(unsigned long long)OCFS2_I(inode)->ip_flags : 0); if (!inode) { - mlog(0, "eep, no inode!\n"); status = -ENOENT; goto bail; } @@ -1238,7 +1212,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry) spin_lock(&OCFS2_I(inode)->ip_lock); if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&OCFS2_I(inode)->ip_lock); - mlog(0, "inode deleted!\n"); status = -ENOENT; goto bail; } @@ -1254,8 +1227,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry) } ocfs2_inode_unlock(inode, 0); bail: - mlog_exit(status); - return status; } @@ -1271,8 +1242,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, int status; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; - mlog_entry("(inode %llu)\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno); status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1302,7 +1272,6 @@ int ocfs2_mark_inode_dirty(handle_t *handle, ocfs2_journal_dirty(handle, bh); leave: - mlog_exit(status); return status; } @@ -1345,8 +1314,7 @@ int ocfs2_validate_inode_block(struct super_block *sb, int rc; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; - mlog(0, "Validating dinode %llu\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 09de77ce002..8f13c5989ea 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -9,7 +9,6 @@ #include <linux/mount.h> #include <linux/compat.h> -#define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> #include "ocfs2.h" @@ -46,6 +45,22 @@ static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, #define o2info_set_request_error(a, b) \ __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) +static inline void __o2info_set_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags |= OCFS2_INFO_FL_FILLED; +} + +#define o2info_set_request_filled(a) \ + __o2info_set_request_filled((struct ocfs2_info_request *)&(a)) + +static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req) +{ + req->ir_flags &= ~OCFS2_INFO_FL_FILLED; +} + +#define o2info_clear_request_filled(a) \ + __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) + static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { int status; @@ -59,7 +74,6 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) *flags = OCFS2_I(inode)->ip_attr; ocfs2_inode_unlock(inode, 0); - mlog_exit(status); return status; } @@ -125,7 +139,6 @@ bail: brelse(bh); - mlog_exit(status); return status; } @@ -139,7 +152,8 @@ int ocfs2_info_handle_blocksize(struct inode *inode, goto bail; oib.ib_blocksize = inode->i_sb->s_blocksize; - oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + o2info_set_request_filled(oib); if (o2info_to_user(oib, req)) goto bail; @@ -163,7 +177,8 @@ int ocfs2_info_handle_clustersize(struct inode *inode, goto bail; oic.ic_clustersize = osb->s_clustersize; - oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + o2info_set_request_filled(oic); if (o2info_to_user(oic, req)) goto bail; @@ -187,7 +202,8 @@ int ocfs2_info_handle_maxslots(struct inode *inode, goto bail; oim.im_max_slots = osb->max_slots; - oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + o2info_set_request_filled(oim); if (o2info_to_user(oim, req)) goto bail; @@ -211,7 +227,8 @@ int ocfs2_info_handle_label(struct inode *inode, goto bail; 
memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); - oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + o2info_set_request_filled(oil); if (o2info_to_user(oil, req)) goto bail; @@ -235,7 +252,8 @@ int ocfs2_info_handle_uuid(struct inode *inode, goto bail; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); - oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + o2info_set_request_filled(oiu); if (o2info_to_user(oiu, req)) goto bail; @@ -261,7 +279,8 @@ int ocfs2_info_handle_fs_features(struct inode *inode, oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; - oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + o2info_set_request_filled(oif); if (o2info_to_user(oif, req)) goto bail; @@ -286,7 +305,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, oij.ij_journal_size = osb->journal->j_inode->i_size; - oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED; + o2info_set_request_filled(oij); if (o2info_to_user(oij, req)) goto bail; @@ -308,7 +327,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, if (o2info_from_user(oir, req)) goto bail; - oir.ir_flags &= ~OCFS2_INFO_FL_FILLED; + o2info_clear_request_filled(oir); if (o2info_to_user(oir, req)) goto bail; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index faa2303dbf0..dcc2d932715 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -31,7 +31,6 @@ #include <linux/time.h> #include <linux/random.h> -#define MLOG_MASK_PREFIX ML_JOURNAL #include <cluster/masklog.h> #include "ocfs2.h" @@ -52,6 +51,7 @@ #include "quota.h" #include "buffer_head_io.h" +#include "ocfs2_trace.h" DEFINE_SPINLOCK(trans_inc_lock); @@ -303,16 +303,15 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) unsigned int flushed; struct ocfs2_journal *journal = NULL; - mlog_entry_void(); - journal = osb->journal; /* Flush all pending commits and checkpoint the journal. 
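The o2info_set_request_filled()/o2info_clear_request_filled() helpers introduced in ioctl.c above depend on every ocfs2_info request struct embedding a struct ocfs2_info_request as its first member, which makes the cast to the common header well defined (C guarantees a struct and its first member share an address). A minimal userspace illustration of the idiom, with hypothetical names:

#include <stdio.h>

#define FL_FILLED 0x1u

/* Common header, playing the role of struct ocfs2_info_request. */
struct req_header {
	unsigned int flags;
};

/* Every concrete request embeds the header as its first member. */
struct blocksize_req {
	struct req_header hdr;
	unsigned int blocksize;
};

/* Mirrors o2info_set_request_filled(): cast via the first member. */
#define set_filled(r) (((struct req_header *)&(r))->flags |= FL_FILLED)

int main(void)
{
	struct blocksize_req r = { { 0 }, 4096 };

	set_filled(r);
	printf("flags = %#x\n", r.hdr.flags);	/* prints flags = 0x1 */
	return 0;
}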
*/ down_write(&journal->j_trans_barrier); - if (atomic_read(&journal->j_num_trans) == 0) { + flushed = atomic_read(&journal->j_num_trans); + trace_ocfs2_commit_cache_begin(flushed); + if (flushed == 0) { up_write(&journal->j_trans_barrier); - mlog(0, "No transactions for me to flush!\n"); goto finally; } @@ -331,13 +330,11 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) atomic_set(&journal->j_num_trans, 0); up_write(&journal->j_trans_barrier); - mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", - journal->j_trans_id, flushed); + trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed); ocfs2_wake_downconvert_thread(osb); wake_up(&journal->j_checkpointed); finally: - mlog_exit(status); return status; } @@ -425,9 +422,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) return 0; old_nblocks = handle->h_buffer_credits; - mlog_entry_void(); - mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); + trace_ocfs2_extend_trans(old_nblocks, nblocks); #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; @@ -440,9 +436,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #endif if (status > 0) { - mlog(0, - "jbd2_journal_extend failed, trying " - "jbd2_journal_restart\n"); + trace_ocfs2_extend_trans_restart(old_nblocks + nblocks); status = jbd2_journal_restart(handle, old_nblocks + nblocks); if (status < 0) { @@ -453,8 +447,6 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) status = 0; bail: - - mlog_exit(status); return status; } @@ -622,12 +614,9 @@ static int __ocfs2_journal_access(handle_t *handle, BUG_ON(!handle); BUG_ON(!bh); - mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n", - (unsigned long long)bh->b_blocknr, type, - (type == OCFS2_JOURNAL_ACCESS_CREATE) ? - "OCFS2_JOURNAL_ACCESS_CREATE" : - "OCFS2_JOURNAL_ACCESS_WRITE", - bh->b_size); + trace_ocfs2_journal_access( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr, type, bh->b_size); /* we can safely remove this assertion after testing. 
*/ if (!buffer_uptodate(bh)) { @@ -668,7 +657,6 @@ static int __ocfs2_journal_access(handle_t *handle, mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", status, type); - mlog_exit(status); return status; } @@ -737,13 +725,10 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) { int status; - mlog_entry("(bh->b_blocknr=%llu)\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); status = jbd2_journal_dirty_metadata(handle, bh); BUG_ON(status); - - mlog_exit_void(); } #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) @@ -775,8 +760,6 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) struct ocfs2_super *osb; int inode_lock = 0; - mlog_entry_void(); - BUG_ON(!journal); osb = journal->j_osb; @@ -820,10 +803,9 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) goto done; } - mlog(0, "inode->i_size = %lld\n", inode->i_size); - mlog(0, "inode->i_blocks = %llu\n", - (unsigned long long)inode->i_blocks); - mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); + trace_ocfs2_journal_init(inode->i_size, + (unsigned long long)inode->i_blocks, + OCFS2_I(inode)->ip_clusters); /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); @@ -833,8 +815,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) goto done; } - mlog(0, "Returned from jbd2_journal_init_inode\n"); - mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); + trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & OCFS2_JOURNAL_DIRTY_FL); @@ -859,7 +840,6 @@ done: } } - mlog_exit(status); return status; } @@ -882,8 +862,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, struct buffer_head *bh = journal->j_bh; struct ocfs2_dinode *fe; - mlog_entry_void(); - fe = (struct ocfs2_dinode *)bh->b_data; /* The journal bh on the osb always comes from ocfs2_journal_init() @@ -906,7 +884,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); - mlog_exit(status); return status; } @@ -921,8 +898,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) struct inode *inode = NULL; int num_running_trans = 0; - mlog_entry_void(); - BUG_ON(!osb); journal = osb->journal; @@ -939,10 +914,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) BUG(); num_running_trans = atomic_read(&(osb->journal->j_num_trans)); - if (num_running_trans > 0) - mlog(0, "Shutting down journal: must wait on %d " - "running transactions!\n", - num_running_trans); + trace_ocfs2_journal_shutdown(num_running_trans); /* Do a commit_cache here. It will flush our journal, *and* * release any locks that are still held. @@ -955,7 +927,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) * completely destroy the journal. 
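For context on the ocfs2_extend_trans() hunk above: jbd2_journal_extend() returns 0 when the running handle gains the extra credits and a positive value when the transaction has no room left, in which case the code restarts the handle with the combined credit count via jbd2_journal_restart(). A condensed sketch of that control flow, with the debug-only forced-restart path and error logging elided:

/* Condensed sketch of ocfs2_extend_trans(); kernel context assumed. */
static int extend_or_restart(handle_t *handle, int nblocks)
{
	int old_nblocks = handle->h_buffer_credits;
	int status;

	status = jbd2_journal_extend(handle, nblocks);
	if (status > 0)		/* no room: commit and start over */
		status = jbd2_journal_restart(handle, old_nblocks + nblocks);

	return status;		/* 0 on success, negative on error */
}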
*/ if (osb->commit_task) { /* Wait for the commit thread */ - mlog(0, "Waiting for ocfs2commit to exit....\n"); + trace_ocfs2_journal_shutdown_wait(osb->commit_task); kthread_stop(osb->commit_task); osb->commit_task = NULL; } @@ -998,7 +970,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) done: if (inode) iput(inode); - mlog_exit_void(); } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1024,8 +995,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) int status = 0; struct ocfs2_super *osb; - mlog_entry_void(); - BUG_ON(!journal); osb = journal->j_osb; @@ -1059,7 +1028,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) osb->commit_task = NULL; done: - mlog_exit(status); return status; } @@ -1070,8 +1038,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) { int status; - mlog_entry_void(); - BUG_ON(!journal); status = jbd2_journal_wipe(journal->j_journal, full); @@ -1085,7 +1051,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) mlog_errno(status); bail: - mlog_exit(status); return status; } @@ -1124,8 +1089,6 @@ static int ocfs2_force_read_journal(struct inode *inode) #define CONCURRENT_JOURNAL_FILL 32ULL struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; - mlog_entry_void(); - memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); @@ -1161,7 +1124,6 @@ static int ocfs2_force_read_journal(struct inode *inode) bail: for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) brelse(bhs[i]); - mlog_exit(status); return status; } @@ -1185,7 +1147,7 @@ struct ocfs2_la_recovery_item { */ void ocfs2_complete_recovery(struct work_struct *work) { - int ret; + int ret = 0; struct ocfs2_journal *journal = container_of(work, struct ocfs2_journal, j_recovery_work); struct ocfs2_super *osb = journal->j_osb; @@ -1194,9 +1156,8 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_quota_recovery *qrec; LIST_HEAD(tmp_la_list); - mlog_entry_void(); - - mlog(0, "completing recovery from keventd\n"); + trace_ocfs2_complete_recovery( + (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno); spin_lock(&journal->j_lock); list_splice_init(&journal->j_la_cleanups, &tmp_la_list); @@ -1205,15 +1166,18 @@ void ocfs2_complete_recovery(struct work_struct *work) list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { list_del_init(&item->lri_list); - mlog(0, "Complete recovery for slot %d\n", item->lri_slot); - ocfs2_wait_on_quotas(osb); la_dinode = item->lri_la_dinode; - if (la_dinode) { - mlog(0, "Clean up local alloc %llu\n", - (unsigned long long)le64_to_cpu(la_dinode->i_blkno)); + tl_dinode = item->lri_tl_dinode; + qrec = item->lri_qrec; + + trace_ocfs2_complete_recovery_slot(item->lri_slot, + la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, + tl_dinode ? 
le64_to_cpu(tl_dinode->i_blkno) : 0, + qrec); + if (la_dinode) { ret = ocfs2_complete_local_alloc_recovery(osb, la_dinode); if (ret < 0) @@ -1222,11 +1186,7 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(la_dinode); } - tl_dinode = item->lri_tl_dinode; if (tl_dinode) { - mlog(0, "Clean up truncate log %llu\n", - (unsigned long long)le64_to_cpu(tl_dinode->i_blkno)); - ret = ocfs2_complete_truncate_log_recovery(osb, tl_dinode); if (ret < 0) @@ -1239,9 +1199,7 @@ void ocfs2_complete_recovery(struct work_struct *work) if (ret < 0) mlog_errno(ret); - qrec = item->lri_qrec; if (qrec) { - mlog(0, "Recovering quota files"); ret = ocfs2_finish_quota_recovery(osb, qrec, item->lri_slot); if (ret < 0) @@ -1252,8 +1210,7 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(item); } - mlog(0, "Recovery completion\n"); - mlog_exit_void(); + trace_ocfs2_complete_recovery_end(ret); } /* NOTE: This function always eats your references to la_dinode and @@ -1339,8 +1296,6 @@ static int __ocfs2_recovery_thread(void *arg) int rm_quota_used = 0, i; struct ocfs2_quota_recovery *qrec; - mlog_entry_void(); - status = ocfs2_wait_on_mount(osb); if (status < 0) { goto bail; @@ -1372,15 +1327,12 @@ restart: * clear it until ocfs2_recover_node() has succeeded. */ node_num = rm->rm_entries[0]; spin_unlock(&osb->osb_lock); - mlog(0, "checking node %d\n", node_num); slot_num = ocfs2_node_num_to_slot(osb, node_num); + trace_ocfs2_recovery_thread_node(node_num, slot_num); if (slot_num == -ENOENT) { status = 0; - mlog(0, "no slot for this node, so no recovery" - "required.\n"); goto skip_recovery; } - mlog(0, "node %d was using slot %d\n", node_num, slot_num); /* It is a bit subtle with quota recovery. We cannot do it * immediately because we have to obtain cluster locks from @@ -1407,7 +1359,7 @@ skip_recovery: spin_lock(&osb->osb_lock); } spin_unlock(&osb->osb_lock); - mlog(0, "All nodes recovered\n"); + trace_ocfs2_recovery_thread_end(status); /* Refresh all journal recovery generations from disk */ status = ocfs2_check_journals_nolocks(osb); @@ -1451,7 +1403,6 @@ bail: if (rm_quota) kfree(rm_quota); - mlog_exit(status); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but * complete_and_exit() seems to be a minimal wrapper around it. */ @@ -1461,19 +1412,15 @@ bail: void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) { - mlog_entry("(node_num=%d, osb->node_num = %d)\n", - node_num, osb->node_num); - mutex_lock(&osb->recovery_lock); - if (osb->disable_recovery) - goto out; - /* People waiting on recovery will wait on - * the recovery map to empty. */ - if (ocfs2_recovery_map_set(osb, node_num)) - mlog(0, "node %d already in recovery map.\n", node_num); + trace_ocfs2_recovery_thread(node_num, osb->node_num, + osb->disable_recovery, osb->recovery_thread_task, + osb->disable_recovery ? + -1 : ocfs2_recovery_map_set(osb, node_num)); - mlog(0, "starting recovery thread...\n"); + if (osb->disable_recovery) + goto out; if (osb->recovery_thread_task) goto out; @@ -1488,8 +1435,6 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) out: mutex_unlock(&osb->recovery_lock); wake_up(&osb->recovery_event); - - mlog_exit_void(); } static int ocfs2_read_journal_inode(struct ocfs2_super *osb, @@ -1563,7 +1508,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, * If not, it needs recovery. 
*/ if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { - mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num, + trace_ocfs2_replay_journal_recovered(slot_num, osb->slot_recovery_generations[slot_num], slot_reco_gen); osb->slot_recovery_generations[slot_num] = slot_reco_gen; status = -EBUSY; @@ -1574,7 +1519,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); if (status < 0) { - mlog(0, "status returned from ocfs2_inode_lock=%d\n", status); + trace_ocfs2_replay_journal_lock_err(status); if (status != -ERESTARTSYS) mlog(ML_ERROR, "Could not lock journal!\n"); goto done; @@ -1587,7 +1532,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, slot_reco_gen = ocfs2_get_recovery_generation(fe); if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { - mlog(0, "No recovery required for node %d\n", node_num); + trace_ocfs2_replay_journal_skip(node_num); /* Refresh recovery generation for the slot */ osb->slot_recovery_generations[slot_num] = slot_reco_gen; goto done; @@ -1608,7 +1553,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, goto done; } - mlog(0, "calling journal_init_inode\n"); journal = jbd2_journal_init_inode(inode); if (journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); @@ -1628,7 +1572,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, ocfs2_clear_journal_error(osb->sb, journal, slot_num); /* wipe the journal */ - mlog(0, "flushing the journal.\n"); jbd2_journal_lock_updates(journal); status = jbd2_journal_flush(journal); jbd2_journal_unlock_updates(journal); @@ -1665,7 +1608,6 @@ done: brelse(bh); - mlog_exit(status); return status; } @@ -1688,8 +1630,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, struct ocfs2_dinode *la_copy = NULL; struct ocfs2_dinode *tl_copy = NULL; - mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n", - node_num, slot_num, osb->node_num); + trace_ocfs2_recover_node(node_num, slot_num, osb->node_num); /* Should not ever be called to recover ourselves -- in that * case we should've called ocfs2_journal_load instead. 
*/ @@ -1698,9 +1639,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, status = ocfs2_replay_journal(osb, node_num, slot_num); if (status < 0) { if (status == -EBUSY) { - mlog(0, "Skipping recovery for slot %u (node %u) " - "as another node has recovered it\n", slot_num, - node_num); + trace_ocfs2_recover_node_skip(slot_num, node_num); status = 0; goto done; } @@ -1735,7 +1674,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, status = 0; done: - mlog_exit(status); return status; } @@ -1808,8 +1746,8 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); osb->slot_recovery_generations[i] = gen; - mlog(0, "Slot %u recovery generation is %u\n", i, - osb->slot_recovery_generations[i]); + trace_ocfs2_mark_dead_nodes(i, + osb->slot_recovery_generations[i]); if (i == osb->slot_num) { spin_unlock(&osb->osb_lock); @@ -1845,7 +1783,6 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) status = 0; bail: - mlog_exit(status); return status; } @@ -1884,11 +1821,12 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) os = &osb->osb_orphan_scan; - mlog(0, "Begin orphan scan\n"); - if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) goto out; + trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); + status = ocfs2_orphan_scan_lock(osb, &seqno); if (status < 0) { if (status != -EAGAIN) @@ -1918,7 +1856,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: - mlog(0, "Orphan scan completed\n"); + trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno, + atomic_read(&os->os_state)); return; } @@ -2002,8 +1941,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, if (IS_ERR(iter)) return 0; - mlog(0, "queue orphan %llu\n", - (unsigned long long)OCFS2_I(iter)->ip_blkno); + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); /* No locking is required for the next_orphan queue as there * is only ever a single process doing orphan recovery. */ OCFS2_I(iter)->ip_next_orphan = p->head; @@ -2119,7 +2057,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, struct inode *iter; struct ocfs2_inode_info *oi; - mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); + trace_ocfs2_recover_orphans(slot); ocfs2_mark_recovering_orphan_dir(osb, slot); ret = ocfs2_queue_orphans(osb, slot, &inode); @@ -2132,7 +2070,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, while (inode) { oi = OCFS2_I(inode); - mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno); + trace_ocfs2_recover_orphans_iput( + (unsigned long long)oi->ip_blkno); iter = oi->ip_next_orphan; @@ -2170,6 +2109,7 @@ static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota) * MOUNTED flag, but this is set right before * dismount_volume() so we can trust it. 
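One detail worth noting in the orphan-recovery hunks above: ocfs2_orphan_filldir() queues inodes by threading them through the inode's own ip_next_orphan pointer (no allocation, and no locking, since a single process performs orphan recovery), and ocfs2_recover_orphans() later walks that chain, saving the next pointer before dropping each inode. The same push-then-walk shape, reduced to a runnable sketch with a hypothetical node type:

#include <stdio.h>

/* Hypothetical stand-in for ocfs2_inode_info with its
 * intrusive ip_next_orphan pointer. */
struct node {
	int id;
	struct node *next_orphan;
};

/* filldir side: prepend onto the singly linked queue */
static void push(struct node **head, struct node *n)
{
	n->next_orphan = *head;
	*head = n;
}

int main(void)
{
	struct node a = { 1, NULL }, b = { 2, NULL };
	struct node *head = NULL;

	push(&head, &a);
	push(&head, &b);
	while (head) {
		/* save the link before "iput()" invalidates the node */
		struct node *next = head->next_orphan;
		printf("recovering orphan %d\n", head->id);
		head = next;
	}
	return 0;
}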
*/ if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { + trace_ocfs2_wait_on_mount(VOLUME_DISABLED); mlog(0, "mount error, exiting!\n"); return -EBUSY; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ec6adbf8f55..210c3523754 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -29,7 +29,6 @@ #include <linux/highmem.h> #include <linux/bitops.h> -#define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> #include "ocfs2.h" @@ -43,6 +42,7 @@ #include "suballoc.h" #include "super.h" #include "sysfile.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -201,8 +201,7 @@ void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) la_max_mb = ocfs2_clusters_to_megabytes(sb, ocfs2_local_alloc_size(sb) * 8); - mlog(0, "requested: %dM, max: %uM, default: %uM\n", - requested_mb, la_max_mb, la_default_mb); + trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb); if (requested_mb == -1) { /* No user request - use defaults */ @@ -276,8 +275,8 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) ret = 1; bail: - mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", - osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); + trace_ocfs2_alloc_should_use_local( + (unsigned long long)bits, osb->local_alloc_state, la_bits, ret); spin_unlock(&osb->osb_lock); return ret; } @@ -291,8 +290,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) struct inode *inode = NULL; struct ocfs2_local_alloc *la; - mlog_entry_void(); - if (osb->local_alloc_bits == 0) goto bail; @@ -364,9 +361,10 @@ bail: if (inode) iput(inode); - mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); + trace_ocfs2_load_local_alloc(osb->local_alloc_bits); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -388,8 +386,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) struct ocfs2_dinode *alloc_copy = NULL; struct ocfs2_dinode *alloc = NULL; - mlog_entry_void(); - cancel_delayed_work(&osb->la_enable_wq); flush_workqueue(ocfs2_wq); @@ -482,8 +478,6 @@ out: if (alloc_copy) kfree(alloc_copy); - - mlog_exit_void(); } /* @@ -502,7 +496,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, struct inode *inode = NULL; struct ocfs2_dinode *alloc; - mlog_entry("(slot_num = %d)\n", slot_num); + trace_ocfs2_begin_local_alloc_recovery(slot_num); *alloc_copy = NULL; @@ -552,7 +546,8 @@ bail: iput(inode); } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -570,8 +565,6 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, struct buffer_head *main_bm_bh = NULL; struct inode *main_bm_inode; - mlog_entry_void(); - main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -620,7 +613,8 @@ out_mutex: out: if (!status) ocfs2_init_steal_slots(osb); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -640,8 +634,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, struct inode *local_alloc_inode; unsigned int free_bits; - mlog_entry_void(); - BUG_ON(!ac); local_alloc_inode = @@ -712,10 +704,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - if (ac->ac_max_block) - mlog(0, "Calling in_range for max block %llu\n", - (unsigned long long)ac->ac_max_block); - ac->ac_inode = local_alloc_inode; /* We should never use localalloc from another slot */ ac->ac_alloc_slot = osb->slot_num; @@ -729,10 +717,12 @@ bail: iput(local_alloc_inode); } - mlog(0, "bits=%d, slot=%d, 
ret=%d\n", bits_wanted, osb->slot_num, - status); + trace_ocfs2_reserve_local_alloc_bits( + (unsigned long long)ac->ac_max_block, + bits_wanted, osb->slot_num, status); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -749,7 +739,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, struct ocfs2_dinode *alloc; struct ocfs2_local_alloc *la; - mlog_entry_void(); BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); local_alloc_inode = ac->ac_inode; @@ -788,7 +777,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, osb->local_alloc_bh); bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -799,13 +789,11 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) u32 count = 0; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - mlog_entry_void(); - buffer = la->la_bitmap; for (i = 0; i < le16_to_cpu(la->la_size); i++) count += hweight8(buffer[i]); - mlog_exit(count); + trace_ocfs2_local_alloc_count_bits(count); return count; } @@ -820,10 +808,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, void *bitmap = NULL; struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; - mlog_entry("(numbits wanted = %u)\n", *numbits); - if (!alloc->id1.bitmap1.i_total) { - mlog(0, "No bits in my window!\n"); bitoff = -1; goto bail; } @@ -883,8 +868,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, } } - mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, - numfound); + trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound); if (numfound == *numbits) bitoff = startoff - numfound; @@ -895,7 +879,10 @@ bail: if (local_resv) ocfs2_resv_discard(resmap, resv); - mlog_exit(bitoff); + trace_ocfs2_local_alloc_find_clear_bits(*numbits, + le32_to_cpu(alloc->id1.bitmap1.i_total), + bitoff, numfound); + return bitoff; } @@ -903,15 +890,12 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) { struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); int i; - mlog_entry_void(); alloc->id1.bitmap1.i_total = 0; alloc->id1.bitmap1.i_used = 0; la->la_bm_off = 0; for(i = 0; i < le16_to_cpu(la->la_size); i++) la->la_bitmap[i] = 0; - - mlog_exit_void(); } #if 0 @@ -952,18 +936,16 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, void *bitmap; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - mlog_entry("total = %u, used = %u\n", - le32_to_cpu(alloc->id1.bitmap1.i_total), - le32_to_cpu(alloc->id1.bitmap1.i_used)); + trace_ocfs2_sync_local_to_main( + le32_to_cpu(alloc->id1.bitmap1.i_total), + le32_to_cpu(alloc->id1.bitmap1.i_used)); if (!alloc->id1.bitmap1.i_total) { - mlog(0, "nothing to sync!\n"); goto bail; } if (le32_to_cpu(alloc->id1.bitmap1.i_used) == le32_to_cpu(alloc->id1.bitmap1.i_total)) { - mlog(0, "all bits were taken!\n"); goto bail; } @@ -985,8 +967,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, ocfs2_clusters_to_blocks(osb->sb, start - count); - mlog(0, "freeing %u bits starting at local alloc bit " - "%u (la_start_blk = %llu, blkno = %llu)\n", + trace_ocfs2_sync_local_to_main_free( count, start - count, (unsigned long long)la_start_blk, (unsigned long long)blkno); @@ -1007,7 +988,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, } bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1132,7 +1114,8 @@ bail: *ac = NULL; } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1148,17 +1131,12 @@ static 
int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, struct ocfs2_dinode *alloc = NULL; struct ocfs2_local_alloc *la; - mlog_entry_void(); - alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); - if (alloc->id1.bitmap1.i_total) - mlog(0, "asking me to alloc a new window over a non-empty " - "one\n"); - - mlog(0, "Allocating %u clusters for a new window.\n", - osb->local_alloc_bits); + trace_ocfs2_local_alloc_new_window( + le32_to_cpu(alloc->id1.bitmap1.i_total), + osb->local_alloc_bits); /* Instruct the allocation code to try the most recently used * cluster group. We'll re-record the group used this pass @@ -1220,13 +1198,13 @@ retry_enospc: ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); - mlog(0, "New window allocated:\n"); - mlog(0, "window la_bm_off = %u\n", - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); - mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total)); + trace_ocfs2_local_alloc_new_window_result( + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(alloc->id1.bitmap1.i_total)); bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1243,8 +1221,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct ocfs2_dinode *alloc_copy = NULL; struct ocfs2_alloc_context *ac = NULL; - mlog_entry_void(); - ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); /* This will lock the main bitmap for us. */ @@ -1324,7 +1300,8 @@ bail: if (ac) ocfs2_free_alloc_context(ac); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index b5cb3ede940..e57c804069e 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/fcntl.h> -#define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> #include "ocfs2.h" diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 7e32db9c2c9..3e9393ca39e 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -31,7 +31,6 @@ #include <linux/signal.h> #include <linux/rbtree.h> -#define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> #include "ocfs2.h" @@ -42,6 +41,7 @@ #include "inode.h" #include "mmap.h" #include "super.h" +#include "ocfs2_trace.h" static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) @@ -49,13 +49,12 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) sigset_t oldset; int ret; - mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); - ocfs2_block_signals(&oldset); ret = filemap_fault(area, vmf); ocfs2_unblock_signals(&oldset); - mlog_exit_ptr(vmf->page); + trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, + area, vmf->page, vmf->pgoff); return ret; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d6c25d76b53..28f2cc1080d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -42,7 +42,6 @@ #include <linux/highmem.h> #include <linux/quotaops.h> -#define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> #include "ocfs2.h" @@ -63,6 +62,7 @@ #include "uptodate.h" #include "xattr.h" #include "acl.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -106,17 +106,15 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, struct dentry *ret; struct ocfs2_inode_info *oi; - mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_lookup(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long 
long)OCFS2_I(dir)->ip_blkno, 0); if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { ret = ERR_PTR(-ENAMETOOLONG); goto bail; } - mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, - dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); - status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT); if (status < 0) { if (status != -ENOENT) @@ -182,7 +180,7 @@ bail_unlock: bail: - mlog_exit_ptr(ret); + trace_ocfs2_lookup_ret(ret); return ret; } @@ -235,9 +233,9 @@ static int ocfs2_mknod(struct inode *dir, sigset_t oldset; int did_block_signals = 0; - mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, - (unsigned long)dev, dentry->d_name.len, - dentry->d_name.name); + trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long)dev, mode); dquot_initialize(dir); @@ -354,10 +352,6 @@ static int ocfs2_mknod(struct inode *dir, goto leave; did_quota_inode = 1; - mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, - inode->i_mode, (unsigned long)dev, dentry->d_name.len, - dentry->d_name.name); - /* do the real work now. */ status = ocfs2_mknod_locked(osb, dir, inode, dev, &new_fe_bh, parent_fe_bh, handle, @@ -436,9 +430,6 @@ leave: if (did_block_signals) ocfs2_unblock_signals(&oldset); - if (status == -ENOSPC) - mlog(0, "Disk is full\n"); - brelse(new_fe_bh); brelse(parent_fe_bh); kfree(si.name); @@ -466,7 +457,8 @@ leave: iput(inode); } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -577,7 +569,8 @@ leave: } } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -615,10 +608,11 @@ static int ocfs2_mkdir(struct inode *dir, { int ret; - mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, + OCFS2_I(dir)->ip_blkno, mode); ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); - mlog_exit(ret); + if (ret) + mlog_errno(ret); return ret; } @@ -630,10 +624,11 @@ static int ocfs2_create(struct inode *dir, { int ret; - mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); - mlog_exit(ret); + if (ret) + mlog_errno(ret); return ret; } @@ -652,9 +647,9 @@ static int ocfs2_link(struct dentry *old_dentry, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; - mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, - old_dentry->d_name.len, old_dentry->d_name.name, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); if (S_ISDIR(inode->i_mode)) return -EPERM; @@ -757,7 +752,8 @@ out: ocfs2_free_dir_lookup_result(&lookup); - mlog_exit(err); + if (err) + mlog_errno(err); return err; } @@ -809,19 +805,17 @@ static int ocfs2_unlink(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; - mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_unlink(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); 
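Events like trace_ocfs2_unlink() above record a dentry name, which is variable-length; tracepoints support that with the __string()/__assign_str()/__get_str() helpers, which size and copy the string into the trace ring buffer at the call site. A simplified, hypothetical two-field version (the patch's real events carry more arguments and use shared classes):

TRACE_EVENT(ocfs2_unlink_sketch,	/* hypothetical name */
	TP_PROTO(unsigned long long dir_blkno, const char *name),
	TP_ARGS(dir_blkno, name),
	TP_STRUCT__entry(
		__field(unsigned long long, dir_blkno)
		__string(name, name)	/* reserves strlen(name) + 1 */
	),
	TP_fast_assign(
		__entry->dir_blkno = dir_blkno;
		__assign_str(name, name);
	),
	TP_printk("dir %llu name %s",
		  __entry->dir_blkno, __get_str(name))
);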
dquot_initialize(dir); BUG_ON(dentry->d_parent->d_inode != dir); - mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - - if (inode == osb->root_inode) { - mlog(0, "Cannot delete the root directory\n"); + if (inode == osb->root_inode) return -EPERM; - } status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1, OI_LS_PARENT); @@ -843,9 +837,10 @@ static int ocfs2_unlink(struct inode *dir, if (OCFS2_I(inode)->ip_blkno != blkno) { status = -ENOENT; - mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)blkno, OCFS2_I(inode)->ip_flags); + trace_ocfs2_unlink_noent( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, + OCFS2_I(inode)->ip_flags); goto leave; } @@ -954,7 +949,8 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -975,9 +971,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, struct buffer_head **tmpbh; struct inode *tmpinode; - mlog_entry("(inode1 = %llu, inode2 = %llu)\n", - (unsigned long long)oi1->ip_blkno, - (unsigned long long)oi2->ip_blkno); + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); if (*bh1) *bh1 = NULL; @@ -988,7 +983,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (oi1->ip_blkno != oi2->ip_blkno) { if (oi1->ip_blkno < oi2->ip_blkno) { /* switch id1 and id2 around */ - mlog(0, "switching them around...\n"); tmpbh = bh2; bh2 = bh1; bh1 = tmpbh; @@ -1024,8 +1018,13 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, mlog_errno(status); } + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1067,10 +1066,9 @@ static int ocfs2_rename(struct inode *old_dir, /* At some point it might be nice to break this function up a * bit. */ - mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", - old_dir, old_dentry, new_dir, new_dentry, - old_dentry->d_name.len, old_dentry->d_name.name, - new_dentry->d_name.len, new_dentry->d_name.name); + trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry, + old_dentry->d_name.len, old_dentry->d_name.name, + new_dentry->d_name.len, new_dentry->d_name.name); dquot_initialize(old_dir); dquot_initialize(new_dir); @@ -1227,16 +1225,15 @@ static int ocfs2_rename(struct inode *old_dir, if (!new_inode) { status = -EACCES; - mlog(0, "We found an inode for name %.*s but VFS " - "didn't give us one.\n", new_dentry->d_name.len, - new_dentry->d_name.name); + trace_ocfs2_rename_target_exists(new_dentry->d_name.len, + new_dentry->d_name.name); goto bail; } if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { status = -EACCES; - mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n", + trace_ocfs2_rename_disagree( (unsigned long long)OCFS2_I(new_inode)->ip_blkno, (unsigned long long)newfe_blkno, OCFS2_I(new_inode)->ip_flags); @@ -1259,8 +1256,7 @@ static int ocfs2_rename(struct inode *old_dir, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; - mlog(0, "aha rename over existing... new_blkno=%llu " - "newfebh=%p bhblocknr=%llu\n", + trace_ocfs2_rename_over_existing( (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 
(unsigned long long)newfe_bh->b_blocknr : 0ULL); @@ -1476,7 +1472,8 @@ bail: brelse(old_dir_bh); brelse(new_dir_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1501,9 +1498,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, * write i_size + 1 bytes. */ blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; - mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n", - (unsigned long long)inode->i_blocks, - i_size_read(inode), blocks); + trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks, + i_size_read(inode), blocks); /* Sanity check -- make sure we're going to fit. */ if (bytes_left > @@ -1579,7 +1575,8 @@ bail: kfree(bhs); } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1610,8 +1607,8 @@ static int ocfs2_symlink(struct inode *dir, sigset_t oldset; int did_block_signals = 0; - mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, - dentry, symname, dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_symlink_begin(dir, dentry, symname, + dentry->d_name.len, dentry->d_name.name); dquot_initialize(dir); @@ -1713,9 +1710,10 @@ static int ocfs2_symlink(struct inode *dir, goto bail; did_quota_inode = 1; - mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, - inode->i_mode, dentry->d_name.len, - dentry->d_name.name); + trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len, + dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + inode->i_mode); status = ocfs2_mknod_locked(osb, dir, inode, 0, &new_fe_bh, parent_fe_bh, handle, @@ -1835,7 +1833,8 @@ bail: iput(inode); } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1844,8 +1843,6 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name) { int status, namelen; - mlog_entry_void(); - namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx", (long long)blkno); if (namelen <= 0) { @@ -1862,12 +1859,12 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name) goto bail; } - mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, - namelen); + trace_ocfs2_blkno_stringify(blkno, name, namelen); status = 0; bail: - mlog_exit(status); + if (status < 0) + mlog_errno(status); return status; } @@ -1980,7 +1977,8 @@ out: iput(orphan_dir_inode); } - mlog_exit(ret); + if (ret) + mlog_errno(ret); return ret; } @@ -1997,7 +1995,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct ocfs2_dinode *orphan_fe; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; - mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + trace_ocfs2_orphan_add_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno); status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); if (status < 0) { @@ -2056,13 +2055,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, fe_bh); - mlog(0, "Inode %llu orphaned in slot %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); + trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + osb->slot_num); leave: brelse(orphan_dir_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2078,17 +2078,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; - mlog_entry_void(); - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); if (status < 0) { mlog_errno(status); goto leave; } - mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n", - name, (unsigned long 
long)OCFS2_I(orphan_dir_inode)->ip_blkno, - OCFS2_ORPHAN_NAMELEN); + trace_ocfs2_orphan_del( + (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, + name, OCFS2_ORPHAN_NAMELEN); /* find it's spot in the orphan directory */ status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, @@ -2124,7 +2122,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, leave: ocfs2_free_dir_lookup_result(&lookup); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2321,9 +2320,6 @@ leave: iput(orphan_dir); } - if (status == -ENOSPC) - mlog(0, "Disk is full\n"); - if ((status < 0) && inode) { clear_nlink(inode); iput(inode); @@ -2358,8 +2354,10 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct buffer_head *di_bh = NULL; struct ocfs2_dir_lookup_result lookup = { NULL, }; - mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry, - dentry->d_name.len, dentry->d_name.name); + trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry, + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); status = ocfs2_inode_lock(dir, &parent_di_bh, 1); if (status < 0) { @@ -2476,7 +2474,8 @@ leave: ocfs2_free_dir_lookup_result(&lookup); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 1a97ba1ec3f..409285854f6 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -147,6 +147,17 @@ struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); +#ifdef CONFIG_OCFS2_FS_STATS +struct ocfs2_lock_stats { + u64 ls_total; /* Total wait in NSEC */ + u32 ls_gets; /* Num acquires */ + u32 ls_fail; /* Num failed acquires */ + + /* Storing max wait in usecs saves 24 bytes per inode */ + u32 ls_max; /* Max wait in USEC */ +}; +#endif + struct ocfs2_lock_res { void *l_priv; struct ocfs2_lock_res_ops *l_ops; @@ -182,15 +193,9 @@ struct ocfs2_lock_res { struct list_head l_debug_list; #ifdef CONFIG_OCFS2_FS_STATS - unsigned long long l_lock_num_prmode; /* PR acquires */ - unsigned long long l_lock_num_exmode; /* EX acquires */ - unsigned int l_lock_num_prmode_failed; /* Failed PR gets */ - unsigned int l_lock_num_exmode_failed; /* Failed EX gets */ - unsigned long long l_lock_total_prmode; /* Tot wait for PR */ - unsigned long long l_lock_total_exmode; /* Tot wait for EX */ - unsigned int l_lock_max_prmode; /* Max wait for PR */ - unsigned int l_lock_max_exmode; /* Max wait for EX */ - unsigned int l_lock_refresh; /* Disk refreshes */ + struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ + u32 l_lock_refresh; /* Disk refreshes */ + struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map l_lockdep_map; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h new file mode 100644 index 00000000000..a1dae5bb54a --- /dev/null +++ b/fs/ocfs2/ocfs2_trace.h @@ -0,0 +1,2739 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ocfs2 + +#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OCFS2_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(ocfs2__int, + TP_PROTO(int num), + TP_ARGS(num), + TP_STRUCT__entry( + __field(int, num) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%d", __entry->num) +); + +#define DEFINE_OCFS2_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int, name, \ + TP_PROTO(int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__uint, + TP_PROTO(unsigned int num), + TP_ARGS(num), + 
TP_STRUCT__entry( + __field( unsigned int, num ) + ), + TP_fast_assign( + __entry->num = num; + ), + TP_printk("%u", __entry->num) +); + +#define DEFINE_OCFS2_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint, name, \ + TP_PROTO(unsigned int num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__ull, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu", __entry->blkno) +); + +#define DEFINE_OCFS2_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull, name, \ + TP_PROTO(unsigned long long num), \ + TP_ARGS(num)) + +DECLARE_EVENT_CLASS(ocfs2__pointer, + TP_PROTO(void *pointer), + TP_ARGS(pointer), + TP_STRUCT__entry( + __field(void *, pointer) + ), + TP_fast_assign( + __entry->pointer = pointer; + ), + TP_printk("%p", __entry->pointer) +); + +#define DEFINE_OCFS2_POINTER_EVENT(name) \ +DEFINE_EVENT(ocfs2__pointer, name, \ + TP_PROTO(void *pointer), \ + TP_ARGS(pointer)) + +DECLARE_EVENT_CLASS(ocfs2__string, + TP_PROTO(const char *name), + TP_ARGS(name), + TP_STRUCT__entry( + __string(name,name) + ), + TP_fast_assign( + __assign_str(name, name); + ), + TP_printk("%s", __get_str(name)) +); + +#define DEFINE_OCFS2_STRING_EVENT(name) \ +DEFINE_EVENT(ocfs2__string, name, \ + TP_PROTO(const char *name), \ + TP_ARGS(name)) + +DECLARE_EVENT_CLASS(ocfs2__int_int, + TP_PROTO(int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%d %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__int_int, name, \ + TP_PROTO(int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_int, + TP_PROTO(unsigned int value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_int, name, \ + TP_PROTO(unsigned int val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%u %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint, name, \ + TP_PROTO(unsigned int val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint, + TP_PROTO(unsigned long long value1, unsigned int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint, name, \ + TP_PROTO(unsigned long long val1, unsigned int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int, + TP_PROTO(unsigned long long value1, int value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(int, value2) + ), + TP_fast_assign( + 
__entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %d", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int, name, \ + TP_PROTO(unsigned long long val1, int val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull, + TP_PROTO(unsigned long long value1, unsigned long long value2), + TP_ARGS(value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %llu", __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull, name, \ + TP_PROTO(unsigned long long val1, unsigned long long val2), \ + TP_ARGS(val1, val2)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %u", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned long long val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint, + TP_PROTO(unsigned long long value1, + unsigned int value2, unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u", __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint, name, \ + TP_PROTO(unsigned long long val1, \ + unsigned int val2, unsigned int val3), \ + TP_ARGS(val1, val2, val3)) + +DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint, + TP_PROTO(unsigned int value1, unsigned int value2, + unsigned int value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned int, value1 ) + __field( unsigned int, value2 ) + __field( unsigned int, value3 ) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__uint_uint_uint, name, \ + TP_PROTO(unsigned int value1, unsigned int value2, \ + unsigned int value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull, + TP_PROTO(unsigned long long value1, + unsigned long long value2, unsigned long long value3), + TP_ARGS(value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned long long, value3) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %llu %llu", + __entry->value1, __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_ull, name, \ + TP_PROTO(unsigned long long value1, unsigned long long 
value2, \ + unsigned long long value3), \ + TP_ARGS(value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int, + TP_PROTO(unsigned long long ull, int value1, int value2, int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field( unsigned long long, ull ) + __field( int, value1 ) + __field( int, value2 ) + __field( int, value3 ) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %d %d %d", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_int_int_int, name, \ + TP_PROTO(unsigned long long ull, int value1, \ + int value2, int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint, + TP_PROTO(unsigned long long ull, unsigned int value1, + unsigned int value2, unsigned int value3), + TP_ARGS(ull, value1, value2, value3), + TP_STRUCT__entry( + __field(unsigned long long, ull) + __field(unsigned int, value1) + __field(unsigned int, value2) + __field(unsigned int, value3) + ), + TP_fast_assign( + __entry->ull = ull; + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + ), + TP_printk("%llu %u %u %u", + __entry->ull, __entry->value1, + __entry->value2, __entry->value3) +); + +#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned int value1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, value1, value2, value3)) + +DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint, + TP_PROTO(unsigned long long value1, unsigned long long value2, + unsigned int value3, unsigned int value4), + TP_ARGS(value1, value2, value3, value4), + TP_STRUCT__entry( + __field(unsigned long long, value1) + __field(unsigned long long, value2) + __field(unsigned int, value3) + __field(unsigned int, value4) + ), + TP_fast_assign( + __entry->value1 = value1; + __entry->value2 = value2; + __entry->value3 = value3; + __entry->value4 = value4; + ), + TP_printk("%llu %llu %u %u", + __entry->value1, __entry->value2, + __entry->value3, __entry->value4) +); + +#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \ +DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \ + TP_PROTO(unsigned long long ull, unsigned long long ull1, \ + unsigned int value2, unsigned int value3), \ + TP_ARGS(ull, ull1, value2, value3)) + +/* Trace events for fs/ocfs2/alloc.c. 
*/ +DECLARE_EVENT_CLASS(ocfs2__btree_ops, + TP_PROTO(unsigned long long owner,\ + unsigned int value1, unsigned int value2), + TP_ARGS(owner, value1, value2), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, value1) + __field(unsigned int, value2) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->value1 = value1; + __entry->value2 = value2; + ), + TP_printk("%llu %u %u", + __entry->owner, __entry->value1, __entry->value2) +); + +#define DEFINE_OCFS2_BTREE_EVENT(name) \ +DEFINE_EVENT(ocfs2__btree_ops, name, \ + TP_PROTO(unsigned long long owner, \ + unsigned int value1, unsigned int value2), \ + TP_ARGS(owner, value1, value2)) + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start); + +DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree); + +DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert); + +TRACE_EVENT(ocfs2_grow_tree, + TP_PROTO(unsigned long long owner, int depth), + TP_ARGS(owner, depth), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(int, depth) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->depth = depth; + ), + TP_printk("%llu %d", __entry->owner, __entry->depth) +); + +TRACE_EVENT(ocfs2_rotate_subtree, + TP_PROTO(int subtree_root, unsigned long long blkno, + int depth), + TP_ARGS(subtree_root, blkno, depth), + TP_STRUCT__entry( + __field(int, subtree_root) + __field(unsigned long long, blkno) + __field(int, depth) + ), + TP_fast_assign( + __entry->subtree_root = subtree_root; + __entry->blkno = blkno; + __entry->depth = depth; + ), + TP_printk("%d %llu %d", __entry->subtree_root, + __entry->blkno, __entry->depth) +); + +TRACE_EVENT(ocfs2_insert_extent, + TP_PROTO(unsigned int ins_appending, unsigned int ins_contig, + int ins_contig_index, int free_records, int ins_tree_depth), + TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records, + ins_tree_depth), + TP_STRUCT__entry( + __field(unsigned int, ins_appending) + __field(unsigned int, ins_contig) + __field(int, ins_contig_index) + __field(int, free_records) + __field(int, ins_tree_depth) + ), + TP_fast_assign( + __entry->ins_appending = ins_appending; + __entry->ins_contig = ins_contig; + __entry->ins_contig_index = ins_contig_index; + __entry->free_records = free_records; + __entry->ins_tree_depth = ins_tree_depth; + ), + TP_printk("%u %u %d %d %d", + __entry->ins_appending, __entry->ins_contig, + __entry->ins_contig_index, __entry->free_records, + __entry->ins_tree_depth) +); + +TRACE_EVENT(ocfs2_split_extent, + TP_PROTO(int split_index, unsigned int c_contig_type, + unsigned int c_has_empty_extent, + unsigned int c_split_covers_rec), + TP_ARGS(split_index, c_contig_type, + c_has_empty_extent, c_split_covers_rec), + TP_STRUCT__entry( + __field(int, split_index) + __field(unsigned int, c_contig_type) + __field(unsigned int, c_has_empty_extent) + __field(unsigned int, c_split_covers_rec) + ), + TP_fast_assign( + __entry->split_index = split_index; + __entry->c_contig_type = c_contig_type; + __entry->c_has_empty_extent = c_has_empty_extent; + __entry->c_split_covers_rec = c_split_covers_rec; + ), + TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type, + __entry->c_has_empty_extent, __entry->c_split_covers_rec) +); + +TRACE_EVENT(ocfs2_remove_extent, + TP_PROTO(unsigned long long owner, unsigned int cpos, + 
unsigned int len, int index, + unsigned int e_cpos, unsigned int clusters), + TP_ARGS(owner, cpos, len, index, e_cpos, clusters), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(int, index) + __field(unsigned int, e_cpos) + __field(unsigned int, clusters) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->index = index; + __entry->e_cpos = e_cpos; + __entry->clusters = clusters; + ), + TP_printk("%llu %u %u %d %u %u", + __entry->owner, __entry->cpos, __entry->len, __entry->index, + __entry->e_cpos, __entry->clusters) +); + +TRACE_EVENT(ocfs2_commit_truncate, + TP_PROTO(unsigned long long ino, unsigned int new_cpos, + unsigned int clusters, unsigned int depth), + TP_ARGS(ino, new_cpos, clusters, depth), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, new_cpos) + __field(unsigned int, clusters) + __field(unsigned int, depth) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->new_cpos = new_cpos; + __entry->clusters = clusters; + __entry->depth = depth; + ), + TP_printk("%llu %u %u %u", + __entry->ino, __entry->new_cpos, + __entry->clusters, __entry->depth) +); + +TRACE_EVENT(ocfs2_validate_extent_block, + TP_PROTO(unsigned long long blkno), + TP_ARGS(blkno), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->blkno = blkno; + ), + TP_printk("%llu ", __entry->blkno) +); + +TRACE_EVENT(ocfs2_rotate_leaf, + TP_PROTO(unsigned int insert_cpos, int insert_index, + int has_empty, int next_free, + unsigned int l_count), + TP_ARGS(insert_cpos, insert_index, has_empty, + next_free, l_count), + TP_STRUCT__entry( + __field(unsigned int, insert_cpos) + __field(int, insert_index) + __field(int, has_empty) + __field(int, next_free) + __field(unsigned int, l_count) + ), + TP_fast_assign( + __entry->insert_cpos = insert_cpos; + __entry->insert_index = insert_index; + __entry->has_empty = has_empty; + __entry->next_free = next_free; + __entry->l_count = l_count; + ), + TP_printk("%u %d %d %d %u", __entry->insert_cpos, + __entry->insert_index, __entry->has_empty, + __entry->next_free, __entry->l_count) +); + +TRACE_EVENT(ocfs2_add_clusters_in_btree_ret, + TP_PROTO(int status, int reason, int err), + TP_ARGS(status, reason, err), + TP_STRUCT__entry( + __field(int, status) + __field(int, reason) + __field(int, err) + ), + TP_fast_assign( + __entry->status = status; + __entry->reason = reason; + __entry->err = err; + ), + TP_printk("%d %d %d", __entry->status, + __entry->reason, __entry->err) +); + +TRACE_EVENT(ocfs2_mark_extent_written, + TP_PROTO(unsigned long long owner, unsigned int cpos, + unsigned int len, unsigned int phys), + TP_ARGS(owner, cpos, len, phys), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, phys) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->phys = phys; + ), + TP_printk("%llu %u %u %u", + __entry->owner, __entry->cpos, + __entry->len, __entry->phys) +); + +DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned int start, unsigned int num), + TP_ARGS(blkno, index, start, num), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned int, start) + __field(unsigned int, num) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + 
__entry->start = start; + __entry->num = num; + ), + TP_printk("%llu %d %u %u", + __entry->blkno, __entry->index, + __entry->start, __entry->num) +); + +#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__truncate_log_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned int start, unsigned int num), \ + TP_ARGS(blkno, index, start, num)) + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append); + +DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs); + +TRACE_EVENT(ocfs2_cache_block_dealloc, + TP_PROTO(int type, int slot, unsigned long long suballoc, + unsigned long long blkno, unsigned int bit), + TP_ARGS(type, slot, suballoc, blkno, bit), + TP_STRUCT__entry( + __field(int, type) + __field(int, slot) + __field(unsigned long long, suballoc) + __field(unsigned long long, blkno) + __field(unsigned int, bit) + ), + TP_fast_assign( + __entry->type = type; + __entry->slot = slot; + __entry->suballoc = suballoc; + __entry->blkno = blkno; + __entry->bit = bit; + ), + TP_printk("%d %d %llu %llu %u", + __entry->type, __entry->slot, __entry->suballoc, + __entry->blkno, __entry->bit) +); + +/* End of trace events for fs/ocfs2/alloc.c. */ + +/* Trace events for fs/ocfs2/localalloc.c. */ + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local); + +DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc); + +DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main); + +TRACE_EVENT(ocfs2_sync_local_to_main_free, + TP_PROTO(int count, int bit, unsigned long long start_blk, + unsigned long long blkno), + TP_ARGS(count, bit, start_blk, blkno), + TP_STRUCT__entry( + __field(int, count) + __field(int, bit) + __field(unsigned long long, start_blk) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->count = count; + __entry->bit = bit; + __entry->start_blk = start_blk; + __entry->blkno = blkno; + ), + TP_printk("%d %d %llu %llu", + __entry->count, __entry->bit, __entry->start_blk, + __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result); + +/* End of trace events for fs/ocfs2/localalloc.c. */ + +/* Trace events for fs/ocfs2/resize.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add); + +/* End of trace events for fs/ocfs2/resize.c. */ + +/* Trace events for fs/ocfs2/suballoc.c. 
*/ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits); + +TRACE_EVENT(ocfs2_relink_block_group, + TP_PROTO(unsigned long long i_blkno, unsigned int chain, + unsigned long long bg_blkno, + unsigned long long prev_blkno), + TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno), + TP_STRUCT__entry( + __field(unsigned long long, i_blkno) + __field(unsigned int, chain) + __field(unsigned long long, bg_blkno) + __field(unsigned long long, prev_blkno) + ), + TP_fast_assign( + __entry->i_blkno = i_blkno; + __entry->chain = chain; + __entry->bg_blkno = bg_blkno; + __entry->prev_blkno = prev_blkno; + ), + TP_printk("%llu %u %llu %llu", + __entry->i_blkno, __entry->chain, __entry->bg_blkno, + __entry->prev_blkno) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits); + +TRACE_EVENT(ocfs2_free_suballoc_bits, + TP_PROTO(unsigned long long inode, unsigned long long group, + unsigned int start_bit, unsigned int count), + TP_ARGS(inode, group, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, inode) + __field(unsigned long long, group) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->group = group; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->inode, __entry->group, + __entry->start_bit, __entry->count) +); + +TRACE_EVENT(ocfs2_free_clusters, + TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk, + unsigned int start_bit, unsigned int count), + TP_ARGS(bg_blkno, start_blk, start_bit, count), + TP_STRUCT__entry( + __field(unsigned long long, bg_blkno) + __field(unsigned long long, start_blk) + __field(unsigned int, start_bit) + __field(unsigned int, count) + ), + TP_fast_assign( + __entry->bg_blkno = bg_blkno; + __entry->start_blk = start_blk; + __entry->start_bit = start_bit; + __entry->count = count; + ), + TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk, + __entry->start_bit, __entry->count) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit); + +/* End of trace events for fs/ocfs2/suballoc.c. */ + +/* Trace events for fs/ocfs2/refcounttree.c. 
*/ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block); + +DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops, + TP_PROTO(unsigned long long blkno, int index, + unsigned long long cpos, + unsigned int clusters, unsigned int refcount), + TP_ARGS(blkno, index, cpos, clusters, refcount), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __field(int, index) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + ), + TP_fast_assign( + __entry->blkno = blkno; + __entry->index = index; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + ), + TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index, + __entry->cpos, __entry->clusters, __entry->refcount) +); + +#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \ +DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \ + TP_PROTO(unsigned long long blkno, int index, \ + unsigned long long cpos, \ + unsigned int count, unsigned int refcount), \ + TP_ARGS(blkno, index, cpos, count, refcount)) + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec); + +TRACE_EVENT(ocfs2_split_refcount_rec, + TP_PROTO(unsigned long long cpos, + unsigned int clusters, unsigned int refcount, + unsigned long long split_cpos, + unsigned int split_clusters, unsigned int split_refcount), + TP_ARGS(cpos, clusters, refcount, + split_cpos, split_clusters, split_refcount), + TP_STRUCT__entry( + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned int, refcount) + __field(unsigned long long, split_cpos) + __field(unsigned int, split_clusters) + __field(unsigned int, split_refcount) + ), + TP_fast_assign( + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->refcount = refcount; + __entry->split_cpos = split_cpos; + __entry->split_clusters = split_clusters; + __entry->split_refcount = split_refcount; + ), + TP_printk("%llu %u %u %llu %u %u", + __entry->cpos, __entry->clusters, __entry->refcount, + __entry->split_cpos, __entry->split_clusters, + __entry->split_refcount) +); + +DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec); + +TRACE_EVENT(ocfs2_decrease_refcount, + TP_PROTO(unsigned long long owner, + unsigned long long cpos, + unsigned int len, int delete), + TP_ARGS(owner, cpos, len, delete), + TP_STRUCT__entry( + __field(unsigned long long, owner) + __field(unsigned long long, cpos) + __field(unsigned int, len) + __field(int, delete) + ), + TP_fast_assign( + __entry->owner = owner; + __entry->cpos = cpos; + __entry->len = len; + __entry->delete = delete; + ), + TP_printk("%llu %llu %u %d", + __entry->owner, __entry->cpos, __entry->len, 
__entry->delete) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits); + +TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate, + TP_PROTO(int recs_add, unsigned long long cpos, + unsigned int clusters, unsigned long long r_cpos, + unsigned int r_clusters, unsigned int refcount, int index), + TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index), + TP_STRUCT__entry( + __field(int, recs_add) + __field(unsigned long long, cpos) + __field(unsigned int, clusters) + __field(unsigned long long, r_cpos) + __field(unsigned int, r_clusters) + __field(unsigned int, refcount) + __field(int, index) + ), + TP_fast_assign( + __entry->recs_add = recs_add; + __entry->cpos = cpos; + __entry->clusters = clusters; + __entry->r_cpos = r_cpos; + __entry->r_clusters = r_clusters; + __entry->refcount = refcount; + __entry->index = index; + ), + TP_printk("%d %llu %u %llu %u %u %d", + __entry->recs_add, __entry->cpos, __entry->clusters, + __entry->r_cpos, __entry->r_clusters, + __entry->refcount, __entry->index) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd); + +TRACE_EVENT(ocfs2_clear_ext_refcount, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int len, unsigned int p_cluster, + unsigned int ext_flags), + TP_ARGS(ino, cpos, len, p_cluster, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, len) + __field(unsigned int, p_cluster) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->len = len; + __entry->p_cluster = p_cluster; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u", + __entry->ino, __entry->cpos, __entry->len, + __entry->p_cluster, __entry->ext_flags) +); + +TRACE_EVENT(ocfs2_replace_clusters, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int old, unsigned int new, unsigned int len, + unsigned int ext_flags), + TP_ARGS(ino, cpos, old, new, len, ext_flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, old) + __field(unsigned int, new) + __field(unsigned int, len) + __field(unsigned int, ext_flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->old = old; + __entry->new = new; + __entry->len = len; + __entry->ext_flags = ext_flags; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->old, __entry->new, + __entry->len, __entry->ext_flags) +); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable); + +TRACE_EVENT(ocfs2_refcount_cow_hunk, + TP_PROTO(unsigned long long ino, unsigned int cpos, + unsigned int write_len, unsigned int max_cpos, + unsigned int cow_start, unsigned int cow_len), + TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, cpos) + __field(unsigned int, write_len) + __field(unsigned int, max_cpos) + __field(unsigned int, cow_start) + __field(unsigned int, cow_len) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->cpos = cpos; + __entry->write_len = write_len; + 
__entry->max_cpos = max_cpos; + __entry->cow_start = cow_start; + __entry->cow_len = cow_len; + ), + TP_printk("%llu %u %u %u %u %u", + __entry->ino, __entry->cpos, __entry->write_len, + __entry->max_cpos, __entry->cow_start, __entry->cow_len) +); + +/* End of trace events for fs/ocfs2/refcounttree.c. */ + +/* Trace events for fs/ocfs2/aops.c. */ + +DECLARE_EVENT_CLASS(ocfs2__get_block, + TP_PROTO(unsigned long long ino, unsigned long long iblock, + void *bh_result, int create), + TP_ARGS(ino, iblock, bh_result, create), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, iblock) + __field(void *, bh_result) + __field(int, create) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->iblock = iblock; + __entry->bh_result = bh_result; + __entry->create = create; + ), + TP_printk("%llu %llu %p %d", + __entry->ino, __entry->iblock, + __entry->bh_result, __entry->create) +); + +#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \ +DEFINE_EVENT(ocfs2__get_block, name, \ + TP_PROTO(unsigned long long ino, unsigned long long iblock, \ + void *bh_result, int create), \ + TP_ARGS(ino, iblock, bh_result, create)) + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block); + +DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap); + +TRACE_EVENT(ocfs2_try_to_write_inline_data, + TP_PROTO(unsigned long long ino, unsigned int len, + unsigned long long pos, unsigned int flags), + TP_ARGS(ino, len, pos, flags), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, len) + __field(unsigned long long, pos) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->len = len; + __entry->pos = pos; + __entry->flags = flags; + ), + TP_printk("%llu %u %llu 0x%x", + __entry->ino, __entry->len, __entry->pos, __entry->flags) +); + +TRACE_EVENT(ocfs2_write_begin_nolock, + TP_PROTO(unsigned long long ino, + long long i_size, unsigned int i_clusters, + unsigned long long pos, unsigned int len, + unsigned int flags, void *page, + unsigned int clusters, unsigned int extents_to_split), + TP_ARGS(ino, i_size, i_clusters, pos, len, flags, + page, clusters, extents_to_split), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(long long, i_size) + __field(unsigned int, i_clusters) + __field(unsigned long long, pos) + __field(unsigned int, len) + __field(unsigned int, flags) + __field(void *, page) + __field(unsigned int, clusters) + __field(unsigned int, extents_to_split) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->i_size = i_size; + __entry->i_clusters = i_clusters; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + __entry->page = page; + __entry->clusters = clusters; + __entry->extents_to_split = extents_to_split; + ), + TP_printk("%llu %lld %u %llu %u %u %p %u %u", + __entry->ino, __entry->i_size, __entry->i_clusters, + __entry->pos, __entry->len, + __entry->flags, __entry->page, __entry->clusters, + __entry->extents_to_split) +); + +TRACE_EVENT(ocfs2_write_end_inline, + TP_PROTO(unsigned long long ino, + unsigned long long pos, unsigned int copied, + unsigned int id_count, unsigned int features), + TP_ARGS(ino, pos, copied, id_count, features), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, pos) + __field(unsigned int, copied) + __field(unsigned int, id_count) + __field(unsigned 
int, features) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->pos = pos; + __entry->copied = copied; + __entry->id_count = id_count; + __entry->features = features; + ), + TP_printk("%llu %llu %u %u %u", + __entry->ino, __entry->pos, __entry->copied, + __entry->id_count, __entry->features) +); + +/* End of trace events for fs/ocfs2/aops.c. */ + +/* Trace events for fs/ocfs2/mmap.c. */ + +TRACE_EVENT(ocfs2_fault, + TP_PROTO(unsigned long long ino, + void *area, void *page, unsigned long pgoff), + TP_ARGS(ino, area, page, pgoff), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(void *, area) + __field(void *, page) + __field(unsigned long, pgoff) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->area = area; + __entry->page = page; + __entry->pgoff = pgoff; + ), + TP_printk("%llu %p %p %lu", + __entry->ino, __entry->area, __entry->page, __entry->pgoff) +); + +/* End of trace events for fs/ocfs2/mmap.c. */ + +/* Trace events for fs/ocfs2/file.c. */ + +DECLARE_EVENT_CLASS(ocfs2__file_ops, + TP_PROTO(void *inode, void *file, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned long long para), + TP_ARGS(inode, file, dentry, ino, d_len, d_name, para), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, file) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned long long, para) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->file = file; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->para = para; + ), + TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, + __entry->dentry, __entry->ino, __entry->para, + __entry->d_len, __get_str(d_name)) +); + +#define DEFINE_OCFS2_FILE_OPS(name) \ +DEFINE_EVENT(ocfs2__file_ops, name, \ +TP_PROTO(void *inode, void *file, void *dentry, \ + unsigned long long ino, \ + unsigned int d_len, const unsigned char *d_name, \ + unsigned long long mode), \ + TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode)) + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_open); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_release); + +DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); + +DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error); + +TRACE_EVENT(ocfs2_extend_allocation, + TP_PROTO(unsigned long long ip_blkno, unsigned long long size, + unsigned int clusters, unsigned int clusters_to_add, + int why, int restart_func), + TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func), + TP_STRUCT__entry( + __field(unsigned long long, ip_blkno) + __field(unsigned long long, size) + __field(unsigned int, clusters) + __field(unsigned int, clusters_to_add) + __field(int, why) + __field(int, restart_func) + ), + TP_fast_assign( + __entry->ip_blkno = ip_blkno; + __entry->size = size; + __entry->clusters = clusters; + __entry->clusters_to_add = clusters_to_add; + __entry->why = why; + __entry->restart_func = restart_func; + ), + TP_printk("%llu %llu %u %u %d %d", + __entry->ip_blkno, __entry->size, __entry->clusters, + __entry->clusters_to_add, __entry->why, __entry->restart_func) +); + +TRACE_EVENT(ocfs2_extend_allocation_end, + TP_PROTO(unsigned long long ino, + unsigned int di_clusters, 
unsigned long long di_size, + unsigned int ip_clusters, unsigned long long i_size), + TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, di_clusters) + __field(unsigned long long, di_size) + __field(unsigned int, ip_clusters) + __field(unsigned long long, i_size) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->di_clusters = di_clusters; + __entry->di_size = di_size; + __entry->ip_clusters = ip_clusters; + __entry->i_size = i_size; + ), + TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters, + __entry->di_size, __entry->ip_clusters, __entry->i_size) +); + +TRACE_EVENT(ocfs2_write_zero_page, + TP_PROTO(unsigned long long ino, + unsigned long long abs_from, unsigned long long abs_to, + unsigned long index, unsigned int zero_from, + unsigned int zero_to), + TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, abs_from) + __field(unsigned long long, abs_to) + __field(unsigned long, index) + __field(unsigned int, zero_from) + __field(unsigned int, zero_to) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->abs_from = abs_from; + __entry->abs_to = abs_to; + __entry->index = index; + __entry->zero_from = zero_from; + __entry->zero_to = zero_to; + ), + TP_printk("%llu %llu %llu %lu %u %u", __entry->ino, + __entry->abs_from, __entry->abs_to, + __entry->index, __entry->zero_from, __entry->zero_to) +); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend); + +TRACE_EVENT(ocfs2_setattr, + TP_PROTO(void *inode, void *dentry, + unsigned long long ino, + unsigned int d_len, const unsigned char *d_name, + unsigned int ia_valid, unsigned int ia_mode, + unsigned int ia_uid, unsigned int ia_gid), + TP_ARGS(inode, dentry, ino, d_len, d_name, + ia_valid, ia_mode, ia_uid, ia_gid), + TP_STRUCT__entry( + __field(void *, inode) + __field(void *, dentry) + __field(unsigned long long, ino) + __field(unsigned int, d_len) + __string(d_name, d_name) + __field(unsigned int, ia_valid) + __field(unsigned int, ia_mode) + __field(unsigned int, ia_uid) + __field(unsigned int, ia_gid) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->dentry = dentry; + __entry->ino = ino; + __entry->d_len = d_len; + __assign_str(d_name, d_name); + __entry->ia_valid = ia_valid; + __entry->ia_mode = ia_mode; + __entry->ia_uid = ia_uid; + __entry->ia_gid = ia_gid; + ), + TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode, + __entry->dentry, __entry->ino, __entry->d_len, + __get_str(d_name), __entry->ia_valid, __entry->ia_mode, + __entry->ia_uid, __entry->ia_gid) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2); + +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); + +TRACE_EVENT(ocfs2_prepare_inode_for_write, + TP_PROTO(unsigned long long ino, unsigned long long saved_pos, + int appending, unsigned long count, + int *direct_io, int *has_refcount), + TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned long long, saved_pos) + __field(int, appending) + __field(unsigned long, count) + __field(int, direct_io) + __field(int, has_refcount) + ), + TP_fast_assign( + __entry->ino = 
ino; + __entry->saved_pos = saved_pos; + __entry->appending = appending; + __entry->count = count; + __entry->direct_io = direct_io ? *direct_io : -1; + __entry->has_refcount = has_refcount ? *has_refcount : -1; + ), + TP_printk("%llu %llu %d %lu %d %d", __entry->ino, + __entry->saved_pos, __entry->appending, __entry->count, + __entry->direct_io, __entry->has_refcount) +); + +DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); + +/* End of trace events for fs/ocfs2/file.c. */ + +/* Trace events for fs/ocfs2/inode.c. */ + +TRACE_EVENT(ocfs2_iget_begin, + TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type), + TP_ARGS(ino, flags, sysfile_type), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(unsigned int, flags) + __field(int, sysfile_type) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->flags = flags; + __entry->sysfile_type = sysfile_type; + ), + TP_printk("%llu %u %d", __entry->ino, + __entry->flags, __entry->sysfile_type) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked); + +TRACE_EVENT(ocfs2_iget_end, + TP_PROTO(void *inode, unsigned long long ino), + TP_ARGS(inode, ino), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + ), + TP_printk("%p %llu", __entry->inode, __entry->ino) +); + +TRACE_EVENT(ocfs2_find_actor, + TP_PROTO(void *inode, unsigned long long ino, + void *args, unsigned long long fi_blkno), + TP_ARGS(inode, ino, args, fi_blkno), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(void *, args) + __field(unsigned long long, fi_blkno) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->args = args; + __entry->fi_blkno = fi_blkno; + ), + TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino, + __entry->args, __entry->fi_blkno) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); + +TRACE_EVENT(ocfs2_inode_is_valid_to_delete, + TP_PROTO(void *task, void *dc_task, unsigned long long ino, + unsigned int flags), + TP_ARGS(task, dc_task, ino, flags), + TP_STRUCT__entry( + __field(void *, task) + __field(void *, dc_task) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->task = task; + __entry->dc_task = dc_task; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task, + __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); + +TRACE_EVENT(ocfs2_inode_revalidate, + TP_PROTO(void *inode, unsigned long long ino, + unsigned int flags), + TP_ARGS(inode, ino, flags), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->ino = ino; + __entry->flags = flags; + ), + TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty); + +/* End of trace 
events for fs/ocfs2/inode.c. */ + +/* Trace events for fs/ocfs2/extent_map.c. */ + +TRACE_EVENT(ocfs2_read_virt_blocks, + TP_PROTO(void *inode, unsigned long long vblock, int nr, + void *bhs, unsigned int flags, void *validate), + TP_ARGS(inode, vblock, nr, bhs, flags, validate), + TP_STRUCT__entry( + __field(void *, inode) + __field(unsigned long long, vblock) + __field(int, nr) + __field(void *, bhs) + __field(unsigned int, flags) + __field(void *, validate) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->vblock = vblock; + __entry->nr = nr; + __entry->bhs = bhs; + __entry->flags = flags; + __entry->validate = validate; + ), + TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock, + __entry->nr, __entry->bhs, __entry->flags, __entry->validate) +); + +/* End of trace events for fs/ocfs2/extent_map.c. */ + +/* Trace events for fs/ocfs2/slot_map.c. */ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block); + +DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot); + +/* End of trace events for fs/ocfs2/slot_map.c. */ + +/* Trace events for fs/ocfs2/heartbeat.c. */ + +DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down); + +/* End of trace events for fs/ocfs2/heartbeat.c. */ + +/* Trace events for fs/ocfs2/super.c. */ + +TRACE_EVENT(ocfs2_remount, + TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags), + TP_ARGS(s_flags, osb_flags, flags), + TP_STRUCT__entry( + __field(unsigned long, s_flags) + __field(unsigned long, osb_flags) + __field(int, flags) + ), + TP_fast_assign( + __entry->s_flags = s_flags; + __entry->osb_flags = osb_flags; + __entry->flags = flags; + ), + TP_printk("%lu %lu %d", __entry->s_flags, + __entry->osb_flags, __entry->flags) +); + +TRACE_EVENT(ocfs2_fill_super, + TP_PROTO(void *sb, void *data, int silent), + TP_ARGS(sb, data, silent), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, data) + __field(int, silent) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->data = data; + __entry->silent = silent; + ), + TP_printk("%p %p %d", __entry->sb, + __entry->data, __entry->silent) +); + +TRACE_EVENT(ocfs2_parse_options, + TP_PROTO(int is_remount, char *options), + TP_ARGS(is_remount, options), + TP_STRUCT__entry( + __field(int, is_remount) + __string(options, options) + ), + TP_fast_assign( + __entry->is_remount = is_remount; + __assign_str(options, options); + ), + TP_printk("%d %s", __entry->is_remount, __get_str(options)) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super); + +TRACE_EVENT(ocfs2_statfs, + TP_PROTO(void *sb, void *buf), + TP_ARGS(sb, buf), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, buf) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->buf = buf; + ), + TP_printk("%p %p", __entry->sb, __entry->buf) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume); + +TRACE_EVENT(ocfs2_initialize_super, + TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir, + unsigned long long system_dir, int cluster_bits), + TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits), + TP_STRUCT__entry( + __string(label, label) + __string(uuid_str, uuid_str) + __field(unsigned long long, root_dir) + __field(unsigned long long, system_dir) + __field(int, cluster_bits) + ), + TP_fast_assign( + __assign_str(label, label); + __assign_str(uuid_str, uuid_str); + __entry->root_dir = root_dir; + __entry->system_dir = system_dir; + __entry->cluster_bits = cluster_bits; + ), + TP_printk("%s %s %llu %llu %d", 
__get_str(label), __get_str(uuid_str), + __entry->root_dir, __entry->system_dir, __entry->cluster_bits) +); + +/* End of trace events for fs/ocfs2/super.c. */ + +/* Trace events for fs/ocfs2/xattr.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation); + +TRACE_EVENT(ocfs2_init_xattr_set_ctxt, + TP_PROTO(const char *name, int meta, int clusters, int credits), + TP_ARGS(name, meta, clusters, credits), + TP_STRUCT__entry( + __string(name, name) + __field(int, meta) + __field(int, clusters) + __field(int, credits) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->meta = meta; + __entry->clusters = clusters; + __entry->credits = credits; + ), + TP_printk("%s %d %d %d", __get_str(name), __entry->meta, + __entry->clusters, __entry->credits) +); + +DECLARE_EVENT_CLASS(ocfs2__xattr_find, + TP_PROTO(unsigned long long ino, const char *name, int name_index, + unsigned int hash, unsigned long long location, + int xe_index), + TP_ARGS(ino, name, name_index, hash, location, xe_index), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __string(name, name) + __field(int, name_index) + __field(unsigned int, hash) + __field(unsigned long long, location) + __field(int, xe_index) + ), + TP_fast_assign( + __entry->ino = ino; + __assign_str(name, name); + __entry->name_index = name_index; + __entry->hash = hash; + __entry->location = location; + __entry->xe_index = xe_index; + ), + TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name), + __entry->name_index, __entry->hash, __entry->location, + __entry->xe_index) +); + +#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \ +DEFINE_EVENT(ocfs2__xattr_find, name, \ +TP_PROTO(unsigned long long ino, const char *name, int name_index, \ + unsigned int hash, unsigned long long bucket, \ + int xe_index), \ + TP_ARGS(ino, name, name_index, hash, bucket, xe_index)) + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find); + +DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block); + 
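
    [Editorial note: every DEFINE_OCFS2_*_EVENT() above expands to
    DEFINE_EVENT() against a shared DECLARE_EVENT_CLASS(), so events that log
    the same argument shapes share one set of probe and format code instead of
    each TRACE_EVENT() carrying its own. As with any tracepoint header, exactly
    one compilation unit must define CREATE_TRACE_POINTS before including
    ocfs2_trace.h so the event bodies are emitted once; a sketch of that usage
    (the ocfs2 source file that actually does this lies outside the section
    shown here, and example_caller is an illustrative name):

	/* in exactly one ocfs2 .c file: emit the tracepoint definitions */
	#define CREATE_TRACE_POINTS
	#include "ocfs2_trace.h"

	/* every other user just includes the header for the declarations */
	#include "ocfs2_trace.h"

	static void example_caller(struct inode *inode)
	{
		/*
		 * A disabled event compiles to a static-branch no-op; once
		 * enabled through tracefs it records the single %llu field
		 * declared by the ocfs2__ull event class.
		 */
		trace_ocfs2_mark_inode_dirty(
				(unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
    ]
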
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket); + +DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec); + +/* End of trace events for fs/ocfs2/xattr.c. */ + +/* Trace events for fs/ocfs2/reservations.c. */ + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert); + +DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end); + +TRACE_EVENT(ocfs2_resv_find_window_begin, + TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal, + unsigned int wanted, int empty_root), + TP_ARGS(r_start, r_end, goal, wanted, empty_root), + TP_STRUCT__entry( + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, goal) + __field(unsigned int, wanted) + __field(int, empty_root) + ), + TP_fast_assign( + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->goal = goal; + __entry->wanted = wanted; + __entry->empty_root = empty_root; + ), + TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end, + __entry->goal, __entry->wanted, __entry->empty_root) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin); + +TRACE_EVENT(ocfs2_cannibalize_resv_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned int, len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_begin, + TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen, + unsigned int r_start, unsigned int r_end, unsigned int r_len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(cstart, cend, clen, r_start, r_end, + r_len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, cstart) + __field(unsigned int, cend) + __field(unsigned int, clen) + __field(unsigned int, r_start) + __field(unsigned int, r_end) + __field(unsigned int, r_len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->cstart = cstart; + __entry->cend = cend; + __entry->clen = clen; + __entry->r_start = r_start; + __entry->r_end = r_end; + __entry->r_len = r_len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u %u %u %u", + __entry->cstart, __entry->cend, __entry->clen, + __entry->r_start, __entry->r_end, __entry->r_len, + __entry->last_start, __entry->last_len) +); + +TRACE_EVENT(ocfs2_resmap_claimed_bits_end, + TP_PROTO(unsigned int start, unsigned int end, unsigned int len, + unsigned int last_start, unsigned int last_len), + TP_ARGS(start, end, len, last_start, last_len), + TP_STRUCT__entry( + __field(unsigned int, start) + __field(unsigned int, end) + __field(unsigned 
int, len) + __field(unsigned int, last_start) + __field(unsigned int, last_len) + ), + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->len = len; + __entry->last_start = last_start; + __entry->last_len = last_len; + ), + TP_printk("%u %u %u %u %u", __entry->start, __entry->end, + __entry->len, __entry->last_start, __entry->last_len) +); + +/* End of trace events for fs/ocfs2/reservations.c. */ + +/* Trace events for fs/ocfs2/quota_local.c. */ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file); + +DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot); + +/* End of trace events for fs/ocfs2/quota_local.c. */ + +/* Trace events for fs/ocfs2/quota_global.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block); + +TRACE_EVENT(ocfs2_sync_dquot, + TP_PROTO(unsigned int dq_id, long long dqb_curspace, + long long spacechange, long long curinodes, + long long inodechange), + TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange), + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(long long, dqb_curspace) + __field(long long, spacechange) + __field(long long, curinodes) + __field(long long, inodechange) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dqb_curspace = dqb_curspace; + __entry->spacechange = spacechange; + __entry->curinodes = curinodes; + __entry->inodechange = inodechange; + ), + TP_printk("%u %lld %lld %lld %lld", __entry->dq_id, + __entry->dqb_curspace, __entry->spacechange, + __entry->curinodes, __entry->inodechange) +); + +TRACE_EVENT(ocfs2_sync_dquot_helper, + TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type, + const char *s_id), + TP_ARGS(dq_id, dq_type, type, s_id), + + TP_STRUCT__entry( + __field(unsigned int, dq_id) + __field(unsigned int, dq_type) + __field(unsigned long, type) + __string(s_id, s_id) + ), + TP_fast_assign( + __entry->dq_id = dq_id; + __entry->dq_type = dq_type; + __entry->type = type; + __assign_str(s_id, s_id); + ), + TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type, + __entry->type, __get_str(s_id)) +); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); + +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); + +/* End of trace events for fs/ocfs2/quota_global.c. */ + +/* Trace events for fs/ocfs2/dir.c. 
*/ +DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el); + +TRACE_EVENT(ocfs2_dx_dir_search, + TP_PROTO(unsigned long long ino, int namelen, const char *name, + unsigned int major_hash, unsigned int minor_hash, + unsigned long long blkno), + TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, namelen) + __string(name, name) + __field(unsigned int, major_hash) + __field(unsigned int, minor_hash) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->blkno = blkno; + ), + TP_printk("%llu %.*s %u %u %llu", __entry->ino, + __entry->namelen, __get_str(name), + __entry->major_hash, __entry->minor_hash, __entry->blkno) +); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir); + +TRACE_EVENT(ocfs2_find_files_on_disk, + TP_PROTO(int namelen, const char *name, void *blkno, + unsigned long long dir), + TP_ARGS(namelen, name, blkno, dir), + TP_STRUCT__entry( + __field(int, namelen) + __string(name, name) + __field(void *, blkno) + __field(unsigned long long, dir) + ), + TP_fast_assign( + __entry->namelen = namelen; + __assign_str(name, name); + __entry->blkno = blkno; + __entry->dir = dir; + ), + TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name), + __entry->blkno, __entry->dir) +); + +TRACE_EVENT(ocfs2_check_dir_for_entry, + TP_PROTO(unsigned long long dir, int namelen, const char *name), + TP_ARGS(dir, namelen, name), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(int, namelen) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->namelen = namelen; + __assign_str(name, name); + ), + TP_printk("%llu %.*s", __entry->dir, + __entry->namelen, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster); + +TRACE_EVENT(ocfs2_dx_dir_index_root_block, + TP_PROTO(unsigned long long dir, + unsigned int major_hash, unsigned int minor_hash, + int namelen, const char *name, unsigned int num_used), + TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __field(unsigned int, major_hash) + __field(unsigned int, minor_hash) + __field(int, namelen) + __string(name, name) + __field(unsigned int, num_used) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->major_hash = major_hash; + __entry->minor_hash = minor_hash; + __entry->namelen = namelen; + __assign_str(name, name); + __entry->num_used = num_used; + ), + TP_printk("%llu %x %x %.*s %u", __entry->dir, + __entry->major_hash, __entry->minor_hash, + __entry->namelen, __get_str(name), __entry->num_used) +); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert); + +/* End of trace events for fs/ocfs2/dir.c. */ + +/* Trace events for fs/ocfs2/namei.c. 
*/ + +DECLARE_EVENT_CLASS(ocfs2__dentry_ops, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long long extra), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long long, extra) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->extra = extra; + ), + TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->extra) +); + +#define DEFINE_OCFS2_DENTRY_OPS(name) \ +DEFINE_EVENT(ocfs2__dentry_ops, name, \ +TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \ + unsigned long long dir_blkno, unsigned long long extra), \ + TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra)) + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create); + +DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret); + +TRACE_EVENT(ocfs2_mknod, + TP_PROTO(void *dir, void *dentry, int name_len, const char *name, + unsigned long long dir_blkno, unsigned long dev, int mode), + TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(unsigned long long, dir_blkno) + __field(unsigned long, dev) + __field(int, mode) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->dir_blkno = dir_blkno; + __entry->dev = dev; + __entry->mode = mode; + ), + TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry, + __entry->name_len, __get_str(name), + __entry->dir_blkno, __entry->dev, __entry->mode) +); + +TRACE_EVENT(ocfs2_link, + TP_PROTO(unsigned long long ino, int old_len, const char *old_name, + int name_len, const char *name), + TP_ARGS(ino, old_len, old_name, name_len, name), + TP_STRUCT__entry( + __field(unsigned long long, ino) + __field(int, old_len) + __string(old_name, old_name) + __field(int, name_len) + __string(name, name) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->name_len = name_len; + __assign_str(name, name); + ), + TP_printk("%llu %.*s %.*s", __entry->ino, + __entry->old_len, __get_str(old_name), + __entry->name_len, __get_str(name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end); + +TRACE_EVENT(ocfs2_rename, + TP_PROTO(void *old_dir, void *old_dentry, + void *new_dir, void *new_dentry, + int old_len, const char *old_name, + int new_len, const char *new_name), + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, + old_len, old_name, new_len, new_name), + TP_STRUCT__entry( + __field(void *, old_dir) + __field(void *, old_dentry) + __field(void *, new_dir) + __field(void *, new_dentry) + __field(int, old_len) + __string(old_name, old_name) + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->old_dir = old_dir; 
+ __entry->old_dentry = old_dentry; + __entry->new_dir = new_dir; + __entry->new_dentry = new_dentry; + __entry->old_len = old_len; + __assign_str(old_name, old_name); + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%p %p %p %p %.*s %.*s", + __entry->old_dir, __entry->old_dentry, + __entry->new_dir, __entry->new_dentry, + __entry->old_len, __get_str(old_name), + __entry->new_len, __get_str(new_name)) +); + +TRACE_EVENT(ocfs2_rename_target_exists, + TP_PROTO(int new_len, const char *new_name), + TP_ARGS(new_len, new_name), + TP_STRUCT__entry( + __field(int, new_len) + __string(new_name, new_name) + ), + TP_fast_assign( + __entry->new_len = new_len; + __assign_str(new_name, new_name); + ), + TP_printk("%.*s", __entry->new_len, __get_str(new_name)) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree); + +TRACE_EVENT(ocfs2_rename_over_existing, + TP_PROTO(unsigned long long new_blkno, void *new_bh, + unsigned long long newdi_blkno), + TP_ARGS(new_blkno, new_bh, newdi_blkno), + TP_STRUCT__entry( + __field(unsigned long long, new_blkno) + __field(void *, new_bh) + __field(unsigned long long, newdi_blkno) + ), + TP_fast_assign( + __entry->new_blkno = new_blkno; + __entry->new_bh = new_bh; + __entry->newdi_blkno = newdi_blkno; + ), + TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh, + __entry->newdi_blkno) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data); + +TRACE_EVENT(ocfs2_symlink_begin, + TP_PROTO(void *dir, void *dentry, const char *symname, + int len, const char *name), + TP_ARGS(dir, dentry, symname, len, name), + TP_STRUCT__entry( + __field(void *, dir) + __field(void *, dentry) + __field(const char *, symname) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dir = dir; + __entry->dentry = dentry; + __entry->symname = symname; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry, + __entry->symname, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_blkno_stringify, + TP_PROTO(unsigned long long blkno, const char *name, int namelen), + TP_ARGS(blkno, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, blkno) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->blkno = blkno; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->blkno, __get_str(name), + __entry->namelen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end); + +TRACE_EVENT(ocfs2_orphan_del, + TP_PROTO(unsigned long long dir, const char *name, int namelen), + TP_ARGS(dir, name, namelen), + TP_STRUCT__entry( + __field(unsigned long long, dir) + __string(name, name) + __field(int, namelen) + ), + TP_fast_assign( + __entry->dir = dir; + __assign_str(name, name); + __entry->namelen = namelen; + ), + TP_printk("%llu %s %d", __entry->dir, __get_str(name), + __entry->namelen) +); + +/* End of trace events for fs/ocfs2/namei.c. */ + +/* Trace events for fs/ocfs2/dcache.c. 
*/ + +TRACE_EVENT(ocfs2_dentry_revalidate, + TP_PROTO(void *dentry, int len, const char *name), + TP_ARGS(dentry, len, name), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_revalidate_negative, + TP_PROTO(int len, const char *name, unsigned long pgen, + unsigned long gen), + TP_ARGS(len, name, pgen, gen), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long, pgen) + __field(unsigned long, gen) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->pgen = pgen; + __entry->gen = gen; + ), + TP_printk("%.*s %lu %lu", __entry->len, __get_str(name), + __entry->pgen, __entry->gen) +); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete); + +DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata); + +DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret); + +TRACE_EVENT(ocfs2_find_local_alias, + TP_PROTO(int len, const char *name), + TP_ARGS(len, name), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + ), + TP_printk("%.*s", __entry->len, __get_str(name)) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock, + TP_PROTO(int len, const char *name, + unsigned long long parent, void *fsdata), + TP_ARGS(len, name, parent, fsdata), + TP_STRUCT__entry( + __field(int, len) + __string(name, name) + __field(unsigned long long, parent) + __field(void *, fsdata) + ), + TP_fast_assign( + __entry->len = len; + __assign_str(name, name); + __entry->parent = parent; + __entry->fsdata = fsdata; + ), + TP_printk("%.*s %llu %p", __entry->len, __get_str(name), + __entry->parent, __entry->fsdata) +); + +TRACE_EVENT(ocfs2_dentry_attach_lock_found, + TP_PROTO(const char *name, unsigned long long parent, + unsigned long long ino), + TP_ARGS(name, parent, ino), + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long long, parent) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __assign_str(name, name); + __entry->parent = parent; + __entry->ino = ino; + ), + TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino) +); +/* End of trace events for fs/ocfs2/dcache.c. */ + +/* Trace events for fs/ocfs2/export.c. 
*/ + +TRACE_EVENT(ocfs2_get_dentry_begin, + TP_PROTO(void *sb, void *handle, unsigned long long blkno), + TP_ARGS(sb, handle, blkno), + TP_STRUCT__entry( + __field(void *, sb) + __field(void *, handle) + __field(unsigned long long, blkno) + ), + TP_fast_assign( + __entry->sb = sb; + __entry->handle = handle; + __entry->blkno = blkno; + ), + TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end); + +TRACE_EVENT(ocfs2_get_parent, + TP_PROTO(void *child, int len, const char *name, + unsigned long long ino), + TP_ARGS(child, len, name, ino), + TP_STRUCT__entry( + __field(void *, child) + __field(int, len) + __string(name, name) + __field(unsigned long long, ino) + ), + TP_fast_assign( + __entry->child = child; + __entry->len = len; + __assign_str(name, name); + __entry->ino = ino; + ), + TP_printk("%p %.*s %llu", __entry->child, __entry->len, + __get_str(name), __entry->ino) +); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end); + +TRACE_EVENT(ocfs2_encode_fh_begin, + TP_PROTO(void *dentry, int name_len, const char *name, + void *fh, int len, int connectable), + TP_ARGS(dentry, name_len, name, fh, len, connectable), + TP_STRUCT__entry( + __field(void *, dentry) + __field(int, name_len) + __string(name, name) + __field(void *, fh) + __field(int, len) + __field(int, connectable) + ), + TP_fast_assign( + __entry->dentry = dentry; + __entry->name_len = name_len; + __assign_str(name, name); + __entry->fh = fh; + __entry->len = len; + __entry->connectable = connectable; + ), + TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len, + __get_str(name), __entry->fh, __entry->len, + __entry->connectable) +); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent); + +DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type); + +/* End of trace events for fs/ocfs2/export.c. */ + +/* Trace events for fs/ocfs2/journal.c. 
*/ + +DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin); + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); + +DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init); + +DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen); + +DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown); + +DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery); + +DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end); + +TRACE_EVENT(ocfs2_complete_recovery_slot, + TP_PROTO(int slot, unsigned long long la_ino, + unsigned long long tl_ino, void *qrec), + TP_ARGS(slot, la_ino, tl_ino, qrec), + TP_STRUCT__entry( + __field(int, slot) + __field(unsigned long long, la_ino) + __field(unsigned long long, tl_ino) + __field(void *, qrec) + ), + TP_fast_assign( + __entry->slot = slot; + __entry->la_ino = la_ino; + __entry->tl_ino = tl_ino; + __entry->qrec = qrec; + ), + TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino, + __entry->tl_ino, __entry->qrec) +); + +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end); + +TRACE_EVENT(ocfs2_recovery_thread, + TP_PROTO(int node_num, int osb_node_num, int disable, + void *recovery_thread, int map_set), + TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set), + TP_STRUCT__entry( + __field(int, node_num) + __field(int, osb_node_num) + __field(int, disable) + __field(void *, recovery_thread) + __field(int, map_set) + ), + TP_fast_assign( + __entry->node_num = node_num; + __entry->osb_node_num = osb_node_num; + __entry->disable = disable; + __entry->recovery_thread = recovery_thread; + __entry->map_set = map_set; + ), + TP_printk("%d %d %d %p %d", __entry->node_num, + __entry->osb_node_num, __entry->disable, + __entry->recovery_thread, __entry->map_set) +); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err); + +DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip); + +DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin); + +DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir); + +DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput); + +DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount); + +/* End of trace events for fs/ocfs2/journal.c. */ + +/* Trace events for fs/ocfs2/buffer_head_io.c. 
*/ + +DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync); + +DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh); + +DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end); + +TRACE_EVENT(ocfs2_write_block, + TP_PROTO(unsigned long long block, void *ci), + TP_ARGS(block, ci), + TP_STRUCT__entry( + __field(unsigned long long, block) + __field(void *, ci) + ), + TP_fast_assign( + __entry->block = block; + __entry->ci = ci; + ), + TP_printk("%llu %p", __entry->block, __entry->ci) +); + +TRACE_EVENT(ocfs2_read_blocks_begin, + TP_PROTO(void *ci, unsigned long long block, + unsigned int nr, int flags), + TP_ARGS(ci, block, nr, flags), + TP_STRUCT__entry( + __field(void *, ci) + __field(unsigned long long, block) + __field(unsigned int, nr) + __field(int, flags) + ), + TP_fast_assign( + __entry->ci = ci; + __entry->block = block; + __entry->nr = nr; + __entry->flags = flags; + ), + TP_printk("%p %llu %u %d", __entry->ci, __entry->block, + __entry->nr, __entry->flags) +); + +/* End of trace events for fs/ocfs2/buffer_head_io.c. */ + +/* Trace events for fs/ocfs2/uptodate.c. */ + +DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin); + +TRACE_EVENT(ocfs2_buffer_cached_end, + TP_PROTO(int index, void *item), + TP_ARGS(index, item), + TP_STRUCT__entry( + __field(int, index) + __field(void *, item) + ), + TP_fast_assign( + __entry->index = index; + __entry->item = item; + ), + TP_printk("%d %p", __entry->index, __entry->item) +); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array); + +DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin); + +DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array); + +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree); + +DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache); + +/* End of trace events for fs/ocfs2/uptodate.c. */ +#endif /* _TRACE_OCFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE ocfs2_trace +#include <trace/define_trace.h> diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index a73f6416648..279aef68025 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -11,7 +11,6 @@ #include <linux/writeback.h> #include <linux/workqueue.h> -#define MLOG_MASK_PREFIX ML_QUOTA #include <cluster/masklog.h> #include "ocfs2_fs.h" @@ -27,6 +26,7 @@ #include "super.h" #include "buffer_head_io.h" #include "quota.h" +#include "ocfs2_trace.h" /* * Locking of quotas with OCFS2 is rather complex. 
Here are rules that @@ -130,8 +130,7 @@ int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh) struct ocfs2_disk_dqtrailer *dqt = ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); - mlog(0, "Validating quota block %llu\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); @@ -341,8 +340,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) u64 pcount; int status; - mlog_entry_void(); - /* Read global header */ gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], OCFS2_INVALID_SLOT); @@ -402,7 +399,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type) msecs_to_jiffies(oinfo->dqi_syncms)); out_err: - mlog_exit(status); + if (status) + mlog_errno(status); return status; out_unlock: ocfs2_unlock_global_qf(oinfo, 0); @@ -508,9 +506,10 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) olditime = dquot->dq_dqb.dqb_itime; oldbtime = dquot->dq_dqb.dqb_btime; ocfs2_global_disk2memdqb(dquot, &dqblk); - mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n", - dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange, - dquot->dq_dqb.dqb_curinodes, (long long)inodechange); + trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace, + (long long)spacechange, + dquot->dq_dqb.dqb_curinodes, + (long long)inodechange); if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) dquot->dq_dqb.dqb_curspace += spacechange; if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) @@ -594,8 +593,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) struct ocfs2_super *osb = OCFS2_SB(sb); int status = 0; - mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id, - dquot->dq_type, type, sb->s_id); + trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type, + type, sb->s_id); if (type != dquot->dq_type) goto out; status = ocfs2_lock_global_qf(oinfo, 1); @@ -621,7 +620,6 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: - mlog_exit(status); return status; } @@ -647,7 +645,7 @@ static int ocfs2_write_dquot(struct dquot *dquot) struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; - mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type); handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); if (IS_ERR(handle)) { @@ -660,7 +658,6 @@ static int ocfs2_write_dquot(struct dquot *dquot) mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); ocfs2_commit_trans(osb, handle); out: - mlog_exit(status); return status; } @@ -686,7 +683,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; - mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type); mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ @@ -722,7 +719,8 @@ out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: mutex_unlock(&dquot->dq_lock); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -743,7 +741,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) int need_alloc = ocfs2_global_qinit_alloc(sb, type); handle_t *handle; - mlog_entry("id=%u, type=%d", dquot->dq_id, type); + trace_ocfs2_acquire_dquot(dquot->dq_id, type); mutex_lock(&dquot->dq_lock); /* * We need an exclusive lock, because we're going to 
update use count @@ -809,7 +807,8 @@ out_dq: set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out: mutex_unlock(&dquot->dq_lock); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -829,7 +828,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(sb); - mlog_entry("id=%u, type=%d", dquot->dq_id, type); + trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type); /* In case user set some limits, sync dquot immediately to global * quota file so that information propagates quicker */ @@ -866,7 +865,8 @@ out_dlock: out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -877,8 +877,6 @@ static int ocfs2_write_info(struct super_block *sb, int type) int status = 0; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; - mlog_entry_void(); - status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; @@ -893,7 +891,8 @@ static int ocfs2_write_info(struct super_block *sb, int type) out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index dc78764ccc4..dc8007fc924 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -8,7 +8,6 @@ #include <linux/quotaops.h> #include <linux/module.h> -#define MLOG_MASK_PREFIX ML_QUOTA #include <cluster/masklog.h> #include "ocfs2_fs.h" @@ -23,6 +22,7 @@ #include "quota.h" #include "uptodate.h" #include "super.h" +#include "ocfs2_trace.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) @@ -475,7 +475,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, struct ocfs2_recovery_chunk *rchunk, *next; qsize_t spacechange, inodechange; - mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); + trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { chunk = rchunk->rc_chunk; @@ -575,7 +575,8 @@ out_put_bh: } if (status < 0) free_recovery_list(&(rec->r_list[type])); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -600,7 +601,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, for (type = 0; type < MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; - mlog(0, "Recovering quota in slot %d\n", slot_num); + trace_ocfs2_finish_quota_recovery(slot_num); lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); if (!lqinode) { status = -ENOENT; @@ -882,9 +883,10 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - od->dq_originodes); spin_unlock(&dq_data_lock); - mlog(0, "Writing local dquot %u space %lld inodes %lld\n", - od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod), - (long long)le64_to_cpu(dqblk->dqb_inodemod)); + trace_olq_set_dquot( + (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), + (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), + od->dq_dquot.dq_id); } /* Write dquot to local quota file */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c384d634872..5d32749c896 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -16,7 +16,6 @@ */ #include <linux/sort.h> -#define MLOG_MASK_PREFIX ML_REFCOUNT #include <cluster/masklog.h> #include "ocfs2.h" #include "inode.h" @@ -34,6 
+33,7 @@ #include "aops.h" #include "xattr.h" #include "namei.h" +#include "ocfs2_trace.h" #include <linux/bio.h> #include <linux/blkdev.h> @@ -84,8 +84,7 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)bh->b_data; - mlog(0, "Validating refcount block %llu\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); @@ -545,8 +544,8 @@ void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) while ((node = rb_last(root)) != NULL) { tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); - mlog(0, "Purge tree %llu\n", - (unsigned long long) tree->rf_blkno); + trace_ocfs2_purge_refcount_trees( + (unsigned long long) tree->rf_blkno); rb_erase(&tree->rf_node, root); ocfs2_free_refcount_tree(tree); @@ -575,7 +574,8 @@ static int ocfs2_create_refcount_tree(struct inode *inode, BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); - mlog(0, "create tree for inode %lu\n", inode->i_ino); + trace_ocfs2_create_refcount_tree( + (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret) { @@ -646,8 +646,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, di->i_refcount_loc = cpu_to_le64(first_blkno); spin_unlock(&oi->ip_lock); - mlog(0, "created tree for inode %lu, refblock %llu\n", - inode->i_ino, (unsigned long long)first_blkno); + trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno); ocfs2_journal_dirty(handle, di_bh); @@ -1256,8 +1255,9 @@ static int ocfs2_change_refcount_rec(handle_t *handle, goto out; } - mlog(0, "change index %d, old count %u, change %d\n", index, - le32_to_cpu(rec->r_refcount), change); + trace_ocfs2_change_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, le32_to_cpu(rec->r_refcount), change); le32_add_cpu(&rec->r_refcount, change); if (!rec->r_refcount) { @@ -1353,8 +1353,8 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, ocfs2_journal_dirty(handle, ref_root_bh); - mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, - le16_to_cpu(new_rb->rf_records.rl_used)); + trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno, + le16_to_cpu(new_rb->rf_records.rl_used)); *ref_leaf_bh = new_bh; new_bh = NULL; @@ -1466,9 +1466,9 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, (struct ocfs2_refcount_block *)new_bh->b_data; struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; - mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n", - (unsigned long long)ref_leaf_bh->b_blocknr, - le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); + trace_ocfs2_divide_leaf_refcount_block( + (unsigned long long)ref_leaf_bh->b_blocknr, + le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); /* * XXX: Improvement later. @@ -1601,8 +1601,8 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); - mlog(0, "insert new leaf block %llu at %u\n", - (unsigned long long)new_bh->b_blocknr, new_cpos); + trace_ocfs2_new_leaf_refcount_block( + (unsigned long long)new_bh->b_blocknr, new_cpos); /* Insert the new leaf block with the specific offset cpos. 
*/ ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, @@ -1794,11 +1794,10 @@ static int ocfs2_insert_refcount_rec(handle_t *handle, (le16_to_cpu(rf_list->rl_used) - index) * sizeof(struct ocfs2_refcount_rec)); - mlog(0, "insert refcount record start %llu, len %u, count %u " - "to leaf block %llu at index %d\n", - (unsigned long long)le64_to_cpu(rec->r_cpos), - le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount), - (unsigned long long)ref_leaf_bh->b_blocknr, index); + trace_ocfs2_insert_refcount_rec( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount)); rf_list->rl_recs[index] = *rec; @@ -1850,10 +1849,12 @@ static int ocfs2_split_refcount_rec(handle_t *handle, BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); - mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n", - le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), - le64_to_cpu(split_rec->r_cpos), - le32_to_cpu(split_rec->r_clusters)); + trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos), + le32_to_cpu(orig_rec->r_clusters), + le32_to_cpu(orig_rec->r_refcount), + le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); /* * If we just need to split the header or tail clusters, @@ -1967,12 +1968,11 @@ static int ocfs2_split_refcount_rec(handle_t *handle, if (split_rec->r_refcount) { rf_list->rl_recs[index] = *split_rec; - mlog(0, "insert refcount record start %llu, len %u, count %u " - "to leaf block %llu at index %d\n", - (unsigned long long)le64_to_cpu(split_rec->r_cpos), - le32_to_cpu(split_rec->r_clusters), - le32_to_cpu(split_rec->r_refcount), - (unsigned long long)ref_leaf_bh->b_blocknr, index); + trace_ocfs2_split_refcount_rec_insert( + (unsigned long long)ref_leaf_bh->b_blocknr, index, + (unsigned long long)le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount)); if (merge) ocfs2_refcount_rec_merge(rb, index); @@ -1997,7 +1997,7 @@ static int __ocfs2_increase_refcount(handle_t *handle, struct ocfs2_refcount_rec rec; unsigned int set_len = 0; - mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n", + trace_ocfs2_increase_refcount_begin( (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)cpos, len); @@ -2024,9 +2024,9 @@ static int __ocfs2_increase_refcount(handle_t *handle, */ if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && set_len <= len) { - mlog(0, "increase refcount rec, start %llu, len %u, " - "count %u\n", (unsigned long long)cpos, set_len, - le32_to_cpu(rec.r_refcount)); + trace_ocfs2_increase_refcount_change( + (unsigned long long)cpos, set_len, + le32_to_cpu(rec.r_refcount)); ret = ocfs2_change_refcount_rec(handle, ci, ref_leaf_bh, index, merge, 1); @@ -2037,7 +2037,7 @@ static int __ocfs2_increase_refcount(handle_t *handle, } else if (!rec.r_refcount) { rec.r_refcount = cpu_to_le32(1); - mlog(0, "insert refcount rec, start %llu, len %u\n", + trace_ocfs2_increase_refcount_insert( (unsigned long long)le64_to_cpu(rec.r_cpos), set_len); ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, @@ -2055,8 +2055,7 @@ static int __ocfs2_increase_refcount(handle_t *handle, rec.r_clusters = cpu_to_le32(set_len); le32_add_cpu(&rec.r_refcount, 1); - mlog(0, "split refcount rec, start %llu, " - "len %u, count %u\n", + trace_ocfs2_increase_refcount_split( (unsigned long long)le64_to_cpu(rec.r_cpos), set_len, 
le32_to_cpu(rec.r_refcount)); ret = ocfs2_split_refcount_rec(handle, ci, @@ -2095,6 +2094,11 @@ static int ocfs2_remove_refcount_extent(handle_t *handle, BUG_ON(rb->rf_records.rl_used); + trace_ocfs2_remove_refcount_extent( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)ref_leaf_bh->b_blocknr, + le32_to_cpu(rb->rf_cpos)); + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), 1, meta_ac, dealloc); @@ -2137,7 +2141,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle, if (!rb->rf_list.l_next_free_rec) { BUG_ON(rb->rf_clusters); - mlog(0, "reset refcount tree root %llu to be a record block.\n", + trace_ocfs2_restore_refcount_block( (unsigned long long)ref_root_bh->b_blocknr); rb->rf_flags = 0; @@ -2184,6 +2188,10 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle, BUG_ON(cpos + len > le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); + trace_ocfs2_decrease_refcount_rec( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + if (cpos == le64_to_cpu(rec->r_cpos) && len == le32_to_cpu(rec->r_clusters)) ret = ocfs2_change_refcount_rec(handle, ci, @@ -2195,12 +2203,6 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle, le32_add_cpu(&split.r_refcount, -1); - mlog(0, "split refcount rec, start %llu, " - "len %u, count %u, original start %llu, len %u\n", - (unsigned long long)le64_to_cpu(split.r_cpos), - len, le32_to_cpu(split.r_refcount), - (unsigned long long)le64_to_cpu(rec->r_cpos), - le32_to_cpu(rec->r_clusters)); ret = ocfs2_split_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, &split, index, 1, @@ -2239,10 +2241,9 @@ static int __ocfs2_decrease_refcount(handle_t *handle, struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct buffer_head *ref_leaf_bh = NULL; - mlog(0, "Tree owner %llu, decrease refcount start %llu, " - "len %u, delete %u\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)cpos, len, delete); + trace_ocfs2_decrease_refcount( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len, delete); while (len) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, @@ -2352,8 +2353,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, { int ret; - mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", - inode->i_ino, cpos, len, phys); + trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno, + cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " @@ -2392,8 +2393,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; u32 len; - mlog(0, "start_cpos %llu, clusters %u\n", - (unsigned long long)start_cpos, clusters); while (clusters) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, clusters, &rec, @@ -2427,12 +2426,11 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; - mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," - "rec->r_clusters %u, rec->r_refcount %u, index %d\n", - recs_add, (unsigned long long)cpos, clusters, - (unsigned long long)le64_to_cpu(rec.r_cpos), - le32_to_cpu(rec.r_clusters), - le32_to_cpu(rec.r_refcount), index); + trace_ocfs2_calc_refcount_meta_credits_iterate( + recs_add, (unsigned long long)cpos, clusters, + (unsigned long long)le64_to_cpu(rec.r_cpos), + 
le32_to_cpu(rec.r_clusters), + le32_to_cpu(rec.r_refcount), index); len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; @@ -2488,7 +2486,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, if (!ref_blocks) goto out; - mlog(0, "we need ref_blocks %d\n", ref_blocks); *meta_add += ref_blocks; *credits += ref_blocks; @@ -2514,6 +2511,10 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, } out: + + trace_ocfs2_calc_refcount_meta_credits( + (unsigned long long)start_cpos, clusters, + *meta_add, *credits); brelse(ref_leaf_bh); brelse(prev_bh); return ret; @@ -2578,8 +2579,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, goto out; } - mlog(0, "reserve new metadata %d blocks, credits = %d\n", - *ref_blocks, *credits); + trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits); out: brelse(ref_root_bh); @@ -2886,8 +2886,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, goto out; } - mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", - meta_add, num_clusters, *credits); + trace_ocfs2_lock_refcount_allocators(meta_add, *credits); ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, meta_ac); if (ret) { @@ -2937,8 +2936,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, loff_t offset, end, map_end; struct address_space *mapping = context->inode->i_mapping; - mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, - new_cluster, new_len, cpos); + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); readahead_pages = (ocfs2_cow_contig_clusters(sb) << @@ -3031,8 +3030,8 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, struct buffer_head *old_bh = NULL; struct buffer_head *new_bh = NULL; - mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, - new_cluster, new_len); + trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, + new_cluster, new_len); for (i = 0; i < blocks; i++, old_block++, new_block++) { new_bh = sb_getblk(osb->sb, new_block); @@ -3085,8 +3084,8 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); u64 ino = ocfs2_metadata_cache_owner(et->et_ci); - mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", - (unsigned long long)ino, cpos, len, p_cluster, ext_flags); + trace_ocfs2_clear_ext_refcount((unsigned long long)ino, + cpos, len, p_cluster, ext_flags); memset(&replace_rec, 0, sizeof(replace_rec)); replace_rec.e_cpos = cpu_to_le32(cpos); @@ -3141,8 +3140,8 @@ static int ocfs2_replace_clusters(handle_t *handle, struct ocfs2_caching_info *ci = context->data_et.et_ci; u64 ino = ocfs2_metadata_cache_owner(ci); - mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", - (unsigned long long)ino, cpos, old, new, len, ext_flags); + trace_ocfs2_replace_clusters((unsigned long long)ino, + cpos, old, new, len, ext_flags); /* If the old clusters are unwritten, no need to duplicate. 
*/ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { @@ -3236,8 +3235,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; struct ocfs2_refcount_rec rec; - mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", - cpos, p_cluster, num_clusters, e_flags); + trace_ocfs2_make_clusters_writable(cpos, p_cluster, + num_clusters, e_flags); ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, &context->data_et, @@ -3475,9 +3474,9 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, goto out; } - mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " - "cow_len %u\n", inode->i_ino, - cpos, write_len, cow_start, cow_len); + trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno, + cpos, write_len, max_cpos, + cow_start, cow_len); BUG_ON(cow_len == 0); @@ -3756,8 +3755,7 @@ int ocfs2_add_refcount_flag(struct inode *inode, goto out; } - mlog(0, "reserve new metadata %d, credits = %d\n", - ref_blocks, credits); + trace_ocfs2_add_refcount_flag(ref_blocks, credits); if (ref_blocks) { ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 3e78db361bc..41ffd36c689 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -30,10 +30,10 @@ #include <linux/bitops.h> #include <linux/list.h> -#define MLOG_MASK_PREFIX ML_RESERVATIONS #include <cluster/masklog.h> #include "ocfs2.h" +#include "ocfs2_trace.h" #ifdef CONFIG_OCFS2_DEBUG_FS #define OCFS2_CHECK_RESERVATIONS @@ -321,8 +321,7 @@ static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, assert_spin_locked(&resv_lock); - mlog(0, "Insert reservation start: %u len: %u\n", new->r_start, - new->r_len); + trace_ocfs2_resv_insert(new->r_start, new->r_len); while (*p) { parent = *p; @@ -423,8 +422,8 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, unsigned int best_start, best_len = 0; int offset, start, found; - mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n", - wanted, search_start, search_len, resmap->m_bitmap_len); + trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len, + wanted, resmap->m_bitmap_len); found = best_start = best_len = 0; @@ -463,7 +462,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, *rlen = best_len; *rstart = best_start; - mlog(0, "Found start: %u len: %u\n", best_start, best_len); + trace_ocfs2_resmap_find_free_bits_end(best_start, best_len); return *rlen; } @@ -487,9 +486,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, * - our window should be last in all reservations * - need to make sure we don't go past end of bitmap */ - - mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n", - resv->r_start, ocfs2_resv_end(resv), goal, wanted); + trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv), + goal, wanted, RB_EMPTY_ROOT(root)); assert_spin_locked(&resv_lock); @@ -498,9 +496,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, * Easiest case - empty tree. We can just take * whatever window of free bits we want. 
*/ - - mlog(0, "Empty root\n"); - clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, resmap->m_bitmap_len - goal, &cstart, &clen); @@ -524,8 +519,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, prev_resv = ocfs2_find_resv_lhs(resmap, goal); if (prev_resv == NULL) { - mlog(0, "Goal on LHS of leftmost window\n"); - /* * A NULL here means that the search code couldn't * find a window that starts before goal. @@ -570,13 +563,15 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, next_resv = NULL; } + trace_ocfs2_resv_find_window_prev(prev_resv->r_start, + ocfs2_resv_end(prev_resv)); + prev = &prev_resv->r_node; /* Now we do a linear search for a window, starting at 'prev_rsv' */ while (1) { next = rb_next(prev); if (next) { - mlog(0, "One more resv found in linear search\n"); next_resv = rb_entry(next, struct ocfs2_alloc_reservation, r_node); @@ -585,7 +580,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, gap_end = next_resv->r_start - 1; gap_len = gap_end - gap_start + 1; } else { - mlog(0, "No next node\n"); /* * We're at the rightmost edge of the * tree. See if a reservation between this @@ -596,6 +590,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, gap_end = resmap->m_bitmap_len - 1; } + trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1, + next ? ocfs2_resv_end(next_resv) : -1); /* * No need to check this gap if we have already found * a larger region of free bits. @@ -654,8 +650,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, lru_resv = list_first_entry(&resmap->m_lru, struct ocfs2_alloc_reservation, r_lru); - mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start, - lru_resv->r_len, ocfs2_resv_end(lru_resv)); + trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start, + lru_resv->r_len, + ocfs2_resv_end(lru_resv)); /* * Cannibalize (some or all) of the target reservation and @@ -684,10 +681,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, resv->r_len = shrink; } - mlog(0, "Reservation now looks like: r_start: %u r_end: %u " - "r_len: %u r_last_start: %u r_last_len: %u\n", - resv->r_start, ocfs2_resv_end(resv), resv->r_len, - resv->r_last_start, resv->r_last_len); + trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); ocfs2_resv_insert(resmap, resv); } @@ -748,7 +744,6 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) wanted = *clen; - mlog(0, "empty reservation, find new window\n"); /* * Try to get a window here. If it works, we must fall * through and test the bitmap . This avoids some @@ -757,6 +752,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, * that inode. 
*/ ocfs2_resv_find_window(resmap, resv, wanted); + trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len); } BUG_ON(ocfs2_resv_empty(resv)); @@ -813,10 +809,10 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, spin_lock(&resv_lock); - mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u " - "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n", - cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), - resv->r_len, resv->r_last_start, resv->r_last_len); + trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, + resv->r_last_len); BUG_ON(cstart < resv->r_start); BUG_ON(cstart > ocfs2_resv_end(resv)); @@ -833,10 +829,9 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, if (!ocfs2_resv_empty(resv)) ocfs2_resv_mark_lru(resmap, resv); - mlog(0, "Reservation now looks like: r_start: %u r_end: %u " - "r_len: %u r_last_start: %u r_last_len: %u\n", - resv->r_start, ocfs2_resv_end(resv), resv->r_len, - resv->r_last_start, resv->r_last_len); + trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, + resv->r_last_len); ocfs2_check_resmap(resmap); diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index dacd553d861..ec55add7604 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -27,7 +27,6 @@ #include <linux/fs.h> #include <linux/types.h> -#define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> #include "ocfs2.h" @@ -39,6 +38,7 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" #include "suballoc.h" @@ -82,7 +82,6 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, backups++; } - mlog_exit_void(); return backups; } @@ -103,8 +102,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); - mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", - new_clusters, first_new_cluster); + trace_ocfs2_update_last_group_and_inode(new_clusters, + first_new_cluster); ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), group_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -176,7 +175,8 @@ out_rollback: le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); } out: - mlog_exit(ret); + if (ret) + mlog_errno(ret); return ret; } @@ -281,8 +281,6 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) u32 first_new_cluster; u64 lgd_blkno; - mlog_entry_void(); - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; @@ -342,7 +340,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out_unlock; } - mlog(0, "extend the last group at %llu, new clusters = %d\n", + + trace_ocfs2_group_extend( (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); @@ -377,7 +376,6 @@ out_mutex: iput(main_bm_inode); out: - mlog_exit_void(); return ret; } @@ -472,8 +470,6 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) struct ocfs2_chain_rec *cr; u16 cl_bpc; - mlog_entry_void(); - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; @@ -520,8 +516,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_unlock; } - mlog(0, "Add a new group %llu in chain = %u, length = %u\n", - (unsigned long long)input->group, input->chain, input->clusters); + 
trace_ocfs2_group_add((unsigned long long)input->group, + input->chain, input->clusters, input->frees); handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); if (IS_ERR(handle)) { @@ -589,6 +585,5 @@ out_mutex: iput(main_bm_inode); out: - mlog_exit_void(); return ret; } diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index ab4e0172cc1..26fc0014d50 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -27,7 +27,6 @@ #include <linux/slab.h> #include <linux/highmem.h> -#define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> #include "ocfs2.h" @@ -39,6 +38,7 @@ #include "slot_map.h" #include "super.h" #include "sysfile.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -142,8 +142,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) BUG_ON(si->si_blocks == 0); BUG_ON(si->si_bh == NULL); - mlog(0, "Refreshing slot map, reading %u block(s)\n", - si->si_blocks); + trace_ocfs2_refresh_slot_info(si->si_blocks); /* * We pass -1 as blocknr because we expect all of si->si_bh to @@ -381,8 +380,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, /* The size checks above should ensure this */ BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); - mlog(0, "Slot map needs %u buffers for %llu bytes\n", - si->si_blocks, bytes); + trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, GFP_KERNEL); @@ -400,8 +398,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, goto bail; } - mlog(0, "Reading slot map block %u at %llu\n", i, - (unsigned long long)blkno); + trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i); bh = NULL; /* Acquire a fresh bh */ status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, @@ -475,8 +472,6 @@ int ocfs2_find_slot(struct ocfs2_super *osb) int slot; struct ocfs2_slot_info *si; - mlog_entry_void(); - si = osb->slot_info; spin_lock(&osb->osb_lock); @@ -505,14 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb) osb->slot_num = slot; spin_unlock(&osb->osb_lock); - mlog(0, "taking node slot %d\n", osb->slot_num); + trace_ocfs2_find_slot(osb->slot_num); status = ocfs2_update_disk_slot(osb, si, osb->slot_num); if (status < 0) mlog_errno(status); bail: - mlog_exit(status); return status; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 71998d4d61d..ab6e2061074 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -29,7 +29,6 @@ #include <linux/slab.h> #include <linux/highmem.h> -#define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> #include "ocfs2.h" @@ -44,6 +43,7 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -308,8 +308,8 @@ static int ocfs2_validate_group_descriptor(struct super_block *sb, int rc; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; - mlog(0, "Validating group descriptor %llu\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_validate_group_descriptor( + (unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); @@ -389,8 +389,6 @@ static int ocfs2_block_group_fill(handle_t *handle, struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct super_block * sb = alloc_inode->i_sb; - mlog_entry_void(); - if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " "b_blocknr (%llu)", @@ -436,7 +434,8 @@ static int ocfs2_block_group_fill(handle_t *handle, * allocation time. 
*/ bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -477,8 +476,8 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, /* setup the group */ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "new descriptor, record %u, at block %llu\n", - alloc_rec, (unsigned long long)bg_blkno); + trace_ocfs2_block_group_alloc_contig( + (unsigned long long)bg_blkno, alloc_rec); bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { @@ -657,8 +656,8 @@ ocfs2_block_group_alloc_discontig(handle_t *handle, /* setup the group */ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "new descriptor, record %u, at block %llu\n", - alloc_rec, (unsigned long long)bg_blkno); + trace_ocfs2_block_group_alloc_discontig( + (unsigned long long)bg_blkno, alloc_rec); bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { @@ -707,8 +706,6 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); - mlog_entry_void(); - cl = &fe->id2.i_chain; status = ocfs2_reserve_clusters_with_limit(osb, le16_to_cpu(cl->cl_cpg), @@ -730,8 +727,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, } if (last_alloc_group && *last_alloc_group != 0) { - mlog(0, "use old allocation group %llu for block group alloc\n", - (unsigned long long)*last_alloc_group); + trace_ocfs2_block_group_alloc( + (unsigned long long)*last_alloc_group); ac->ac_last_group = *last_alloc_group; } @@ -796,7 +793,8 @@ bail: brelse(bg_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -814,8 +812,6 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_dinode *fe; u32 free_bits; - mlog_entry_void(); - alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); if (!alloc_inode) { mlog_errno(-EINVAL); @@ -855,16 +851,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, if (bits_wanted > free_bits) { /* cluster bitmap never grows */ if (ocfs2_is_cluster_bitmap(alloc_inode)) { - mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", - bits_wanted, free_bits); + trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted, + free_bits); status = -ENOSPC; goto bail; } if (!(flags & ALLOC_NEW_GROUP)) { - mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " - "and we don't alloc a new group for it.\n", - slot, bits_wanted, free_bits); + trace_ocfs2_reserve_suballoc_bits_no_new_group( + slot, bits_wanted, free_bits); status = -ENOSPC; goto bail; } @@ -890,7 +885,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, bail: brelse(bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1052,7 +1048,8 @@ bail: *ac = NULL; } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1119,8 +1116,8 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, spin_lock(&osb->osb_lock); osb->osb_inode_alloc_group = alloc_group; spin_unlock(&osb->osb_lock); - mlog(0, "after reservation, new allocation group is " - "%llu\n", (unsigned long long)alloc_group); + trace_ocfs2_reserve_new_inode_new_group( + (unsigned long long)alloc_group); /* * Some inodes must be freed by us, so try to allocate @@ -1152,7 +1149,8 @@ bail: *ac = NULL; } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1189,8 +1187,6 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, { int status; - mlog_entry_void(); - *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; @@ -1229,7 
+1225,8 @@ bail: *ac = NULL; } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1357,15 +1354,12 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, void *bitmap = bg->bg_bitmap; int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - mlog_entry_void(); - /* All callers get the descriptor via * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); + trace_ocfs2_block_group_set_bits(bit_off, num_bits); if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; @@ -1394,7 +1388,8 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, ocfs2_journal_dirty(handle, group_bh); bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1437,10 +1432,10 @@ static int ocfs2_relink_block_group(handle_t *handle, BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); - mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), chain, - (unsigned long long)le64_to_cpu(bg->bg_blkno), - (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); + trace_ocfs2_relink_block_group( + (unsigned long long)le64_to_cpu(fe->i_blkno), chain, + (unsigned long long)le64_to_cpu(bg->bg_blkno), + (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); bg_ptr = le64_to_cpu(bg->bg_next_group); @@ -1484,7 +1479,8 @@ out_rollback: prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1525,10 +1521,10 @@ static int ocfs2_cluster_group_search(struct inode *inode, if ((gd_cluster_off + max_bits) > OCFS2_I(inode)->ip_clusters) { max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; - mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - OCFS2_I(inode)->ip_clusters, max_bits); + trace_ocfs2_cluster_group_search_wrong_max_bits( + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + OCFS2_I(inode)->ip_clusters, max_bits); } ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), @@ -1542,9 +1538,9 @@ static int ocfs2_cluster_group_search(struct inode *inode, gd_cluster_off + res->sr_bit_offset + res->sr_bits); - mlog(0, "Checking %llu against %llu\n", - (unsigned long long)blkoff, - (unsigned long long)max_block); + trace_ocfs2_cluster_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); if (blkoff > max_block) return -ENOSPC; } @@ -1588,9 +1584,9 @@ static int ocfs2_block_group_search(struct inode *inode, if (!ret && max_block) { blkoff = le64_to_cpu(bg->bg_blkno) + res->sr_bit_offset + res->sr_bits; - mlog(0, "Checking %llu against %llu\n", - (unsigned long long)blkoff, - (unsigned long long)max_block); + trace_ocfs2_block_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); if (blkoff > max_block) ret = -ENOSPC; } @@ -1756,9 +1752,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, struct ocfs2_group_desc *bg; chain = ac->ac_chain; - mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n", - bits_wanted, chain, - (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); + trace_ocfs2_search_chain_begin( + (unsigned long 
long)OCFS2_I(alloc_inode)->ip_blkno, + bits_wanted, chain); status = ocfs2_read_group_descriptor(alloc_inode, fe, le64_to_cpu(cl->cl_recs[chain].c_blkno), @@ -1799,8 +1795,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, goto bail; } - mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", - res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); + trace_ocfs2_search_chain_succ( + (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits); res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); @@ -1861,8 +1857,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, goto bail; } - mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, - (unsigned long long)le64_to_cpu(fe->i_blkno)); + trace_ocfs2_search_chain_end( + (unsigned long long)le64_to_cpu(fe->i_blkno), + res->sr_bits); out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); @@ -1870,7 +1867,8 @@ bail: brelse(group_bh); brelse(prev_group_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1888,8 +1886,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; - mlog_entry_void(); - BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); BUG_ON(!ac->ac_bh); @@ -1945,8 +1941,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } - mlog(0, "Search of victim chain %u came up with nothing, " - "trying all chains now.\n", victim); + trace_ocfs2_claim_suballoc_bits(victim); /* If we didn't pick a good victim, then just default to * searching each chain in order. Don't allow chain relinking @@ -1984,7 +1979,8 @@ set_hint: } bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2021,7 +2017,8 @@ int ocfs2_claim_metadata(handle_t *handle, *num_bits = res.sr_bits; status = 0; bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2172,8 +2169,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, goto out; } - mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, - (unsigned long long)di_blkno); + trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, + res->sr_bits); atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); @@ -2201,8 +2198,6 @@ int ocfs2_claim_new_inode(handle_t *handle, int status; struct ocfs2_suballoc_result res; - mlog_entry_void(); - BUG_ON(!ac); BUG_ON(ac->ac_bits_given != 0); BUG_ON(ac->ac_bits_wanted != 1); @@ -2230,7 +2225,8 @@ int ocfs2_claim_new_inode(handle_t *handle, ocfs2_save_inode_ac_group(dir, ac); status = 0; bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2307,8 +2303,6 @@ int __ocfs2_claim_clusters(handle_t *handle, struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); - mlog_entry_void(); - BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL @@ -2363,7 +2357,8 @@ int __ocfs2_claim_clusters(handle_t *handle, ac->ac_bits_given += *num_clusters; bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2392,13 +2387,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, unsigned int tmp; struct ocfs2_group_desc *undo_bg = NULL; - mlog_entry_void(); - /* The caller got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
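For context on the "victim" chain that trace_ocfs2_claim_suballoc_bits() logs above: ocfs2 first tries the chain advertising the most free bits, and only if that fails does it scan every chain in order with relinking disabled. The picker itself is untouched by this patch and so not shown; reconstructed as a sketch (field names from struct ocfs2_chain_list, body paraphrased from suballoc.c rather than quoted):

        static u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
        {
                u16 curr, best;

                best = curr = 0;
                while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
                        /* keep the chain record with the most free bits */
                        if (le32_to_cpu(cl->cl_recs[curr].c_free) >
                            le32_to_cpu(cl->cl_recs[best].c_free))
                                best = curr;
                        curr++;
                }
                return best;
        }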
*/ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - mlog(0, "off = %u, num = %u\n", bit_off, num_bits); + trace_ocfs2_block_group_clear_bits(bit_off, num_bits); BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), @@ -2463,8 +2456,6 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; - mlog_entry_void(); - /* The alloc_bh comes from ocfs2_free_dinode() or * ocfs2_free_clusters(). The callers have all locked the * allocator and gotten alloc_bh from the lock call. This @@ -2473,9 +2464,10 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); - mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", - (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, - (unsigned long long)bg_blkno, start_bit); + trace_ocfs2_free_suballoc_bits( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + (unsigned long long)bg_blkno, + start_bit, count); status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, &group_bh); @@ -2511,7 +2503,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, bail: brelse(group_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2556,11 +2549,8 @@ static int _ocfs2_free_clusters(handle_t *handle, /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry - * about looping on them. */ - - mlog_entry_void(); - - /* This is expensive. We can safely remove once this stuff has + * about looping on them. + * This is expensive. We can safely remove once this stuff has * gotten tested really well. */ BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); @@ -2569,10 +2559,9 @@ static int _ocfs2_free_clusters(handle_t *handle, ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); - mlog(0, "want to free %u clusters starting at block %llu\n", - num_clusters, (unsigned long long)start_blk); - mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", - (unsigned long long)bg_blkno, bg_start_bit); + trace_ocfs2_free_clusters((unsigned long long)bg_blkno, + (unsigned long long)start_blk, + bg_start_bit, num_clusters); status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, bg_start_bit, bg_blkno, @@ -2586,7 +2575,8 @@ static int _ocfs2_free_clusters(handle_t *handle, num_clusters); out: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2756,7 +2746,7 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, struct buffer_head *inode_bh = NULL; struct ocfs2_dinode *inode_fe; - mlog_entry("blkno: %llu\n", (unsigned long long)blkno); + trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); /* dirty read disk */ status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); @@ -2793,7 +2783,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, bail: brelse(inode_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2816,8 +2807,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, u64 bg_blkno; int status; - mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, - (unsigned int)bit); + trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, + (unsigned int)bit); alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; if ((bit + 1) > 
ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { @@ -2844,7 +2835,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, bail: brelse(group_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2869,7 +2861,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; - mlog_entry("blkno: %llu", (unsigned long long)blkno); + trace_ocfs2_test_inode_bit((unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, &group_blkno, &suballoc_bit); @@ -2910,6 +2902,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) iput(inode_alloc_inode); brelse(alloc_bh); bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 236ed1bdca2..69fa11b35aa 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -42,7 +42,9 @@ #include <linux/seq_file.h> #include <linux/quotaops.h> -#define MLOG_MASK_PREFIX ML_SUPER +#define CREATE_TRACE_POINTS +#include "ocfs2_trace.h" + #include <cluster/masklog.h> #include "ocfs2.h" @@ -441,8 +443,6 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) int status = 0; int i; - mlog_entry_void(); - new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); if (IS_ERR(new)) { status = PTR_ERR(new); @@ -478,7 +478,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) } bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -488,8 +489,6 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) int status = 0; int i; - mlog_entry_void(); - for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; i < NUM_SYSTEM_INODES; i++) { @@ -508,7 +507,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) } bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -517,8 +517,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) int i; struct inode *inode; - mlog_entry_void(); - for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { inode = osb->global_system_inodes[i]; if (inode) { @@ -540,7 +538,7 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) } if (!osb->local_system_inodes) - goto out; + return; for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { if (osb->local_system_inodes[i]) { @@ -551,9 +549,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) kfree(osb->local_system_inodes); osb->local_system_inodes = NULL; - -out: - mlog_exit(0); } /* We're allocating fs objects, use GFP_NOFS */ @@ -684,12 +679,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } if (*flags & MS_RDONLY) { - mlog(0, "Going to ro mode.\n"); sb->s_flags |= MS_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { - mlog(0, "Making ro filesystem writeable.\n"); - if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { mlog(ML_ERROR, "Cannot remount RDWR " "filesystem due to previous errors.\n"); @@ -707,6 +699,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } + trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ @@ -1032,7 +1025,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) char nodestr[8]; struct ocfs2_blockcheck_stats stats; - mlog_entry("%p, %p, %i", sb, data, 
silent); + trace_ocfs2_fill_super(sb, data, silent); if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { status = -EINVAL; @@ -1208,7 +1201,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) mlog_errno(status); atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); - mlog_exit(status); return status; } } @@ -1222,7 +1214,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); - mlog_exit(status); return status; read_super_error: @@ -1237,7 +1228,8 @@ read_super_error: ocfs2_dismount_volume(sb, 1); } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1320,8 +1312,7 @@ static int ocfs2_parse_options(struct super_block *sb, char *p; u32 tmp; - mlog_entry("remount: %d, options: \"%s\"\n", is_remount, - options ? options : "(none)"); + trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); mopt->commit_interval = 0; mopt->mount_opt = OCFS2_MOUNT_NOINTR; @@ -1538,7 +1529,6 @@ static int ocfs2_parse_options(struct super_block *sb, status = 1; bail: - mlog_exit(status); return status; } @@ -1629,8 +1619,6 @@ static int __init ocfs2_init(void) { int status; - mlog_entry_void(); - ocfs2_print_version(); status = init_ocfs2_uptodate_cache(); @@ -1664,10 +1652,9 @@ leave: if (status < 0) { ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); + mlog_errno(status); } - mlog_exit(status); - if (status >= 0) { return register_filesystem(&ocfs2_fs_type); } else @@ -1676,8 +1663,6 @@ leave: static void __exit ocfs2_exit(void) { - mlog_entry_void(); - if (ocfs2_wq) { flush_workqueue(ocfs2_wq); destroy_workqueue(ocfs2_wq); @@ -1692,18 +1677,14 @@ static void __exit ocfs2_exit(void) unregister_filesystem(&ocfs2_fs_type); exit_ocfs2_uptodate_cache(); - - mlog_exit_void(); } static void ocfs2_put_super(struct super_block *sb) { - mlog_entry("(0x%p)\n", sb); + trace_ocfs2_put_super(sb); ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - - mlog_exit_void(); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1715,7 +1696,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) struct buffer_head *bh = NULL; struct inode *inode = NULL; - mlog_entry("(%p, %p)\n", dentry->d_sb, buf); + trace_ocfs2_statfs(dentry->d_sb, buf); osb = OCFS2_SB(dentry->d_sb); @@ -1762,7 +1743,8 @@ bail: if (inode) iput(inode); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1882,8 +1864,6 @@ static int ocfs2_mount_volume(struct super_block *sb) int unlock_super = 0; struct ocfs2_super *osb = OCFS2_SB(sb); - mlog_entry_void(); - if (ocfs2_is_hard_readonly(osb)) goto leave; @@ -1928,7 +1908,6 @@ leave: if (unlock_super) ocfs2_super_unlock(osb, 1); - mlog_exit(status); return status; } @@ -1938,7 +1917,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) struct ocfs2_super *osb = NULL; char nodestr[8]; - mlog_entry("(0x%p)\n", sb); + trace_ocfs2_dismount_volume(sb); BUG_ON(!sb); osb = OCFS2_SB(sb); @@ -2090,8 +2069,6 @@ static int ocfs2_initialize_super(struct super_block *sb, struct ocfs2_super *osb; u64 total_blocks; - mlog_entry_void(); - osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); if (!osb) { status = -ENOMEM; @@ -2155,7 +2132,6 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } - mlog(0, "max_slots for this device: %u\n", osb->max_slots); ocfs2_orphan_scan_init(osb); @@ 
-2294,7 +2270,6 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); osb->s_clustersize = 1 << osb->s_clustersize_bits; - mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits); if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { @@ -2333,11 +2308,10 @@ static int ocfs2_initialize_super(struct super_block *sb, le64_to_cpu(di->id2.i_super.s_first_cluster_group); osb->fs_generation = le32_to_cpu(di->i_fs_generation); osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); - mlog(0, "vol_label: %s\n", osb->vol_label); - mlog(0, "uuid: %s\n", osb->uuid_str); - mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", - (unsigned long long)osb->root_blkno, - (unsigned long long)osb->system_dir_blkno); + trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, + (unsigned long long)osb->root_blkno, + (unsigned long long)osb->system_dir_blkno, + osb->s_clustersize_bits); osb->osb_dlm_debug = ocfs2_new_dlm_debug(); if (!osb->osb_dlm_debug) { @@ -2380,7 +2354,6 @@ static int ocfs2_initialize_super(struct super_block *sb, } bail: - mlog_exit(status); return status; } @@ -2396,8 +2369,6 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, { int status = -EAGAIN; - mlog_entry_void(); - if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { /* We have to do a raw check of the feature here */ @@ -2452,7 +2423,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, } out: - mlog_exit(status); + if (status && status != -EAGAIN) + mlog_errno(status); return status; } @@ -2465,8 +2437,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * recover * ourselves. */ - mlog_entry_void(); - /* Init our journal object. */ status = ocfs2_journal_init(osb->journal, &dirty); if (status < 0) { @@ -2516,8 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * ourselves as mounted. 
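One detail worth noting from the super.c hunk above: '#define CREATE_TRACE_POINTS' must appear in exactly one translation unit per trace header, before the include. That one object file emits the actual tracepoint code; every other file that includes ocfs2_trace.h sees declarations only. In outline:

        /* fs/ocfs2/super.c - the single file that instantiates the events */
        #define CREATE_TRACE_POINTS
        #include "ocfs2_trace.h"

        /* every other fs/ocfs2/*.c - declarations only */
        #include "ocfs2_trace.h"

Defining CREATE_TRACE_POINTS in a second file would emit duplicate definitions and fail at link time, which is why only super.c carries it.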
*/ } - mlog(0, "Journal loaded.\n"); - status = ocfs2_load_local_alloc(osb); if (status < 0) { mlog_errno(status); @@ -2549,7 +2517,8 @@ finally: if (local_alloc) kfree(local_alloc); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -2561,8 +2530,6 @@ finally: */ static void ocfs2_delete_osb(struct ocfs2_super *osb) { - mlog_entry_void(); - /* This function assumes that the caller has the main osb resource */ ocfs2_free_slot_info(osb); @@ -2580,8 +2547,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->uuid_str); ocfs2_put_dlm_debug(osb->osb_dlm_debug); memset(osb, 0, sizeof(struct ocfs2_super)); - - mlog_exit_void(); } /* Put OCFS2 into a readonly state, or (if the user specifies it), diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 9975457c981..5d22872e2bb 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -40,7 +40,6 @@ #include <linux/pagemap.h> #include <linux/namei.h> -#define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> #include "ocfs2.h" @@ -62,8 +61,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, char *link = NULL; struct ocfs2_dinode *fe; - mlog_entry_void(); - status = ocfs2_read_inode_block(inode, bh); if (status < 0) { mlog_errno(status); @@ -74,7 +71,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, fe = (struct ocfs2_dinode *) (*bh)->b_data; link = (char *) fe->id2.i_symlink; bail: - mlog_exit(status); return link; } @@ -88,8 +84,6 @@ static int ocfs2_readlink(struct dentry *dentry, struct buffer_head *bh = NULL; struct inode *inode = dentry->d_inode; - mlog_entry_void(); - link = ocfs2_fast_symlink_getlink(inode, &bh); if (IS_ERR(link)) { ret = PTR_ERR(link); @@ -104,7 +98,8 @@ static int ocfs2_readlink(struct dentry *dentry, brelse(bh); out: - mlog_exit(ret); + if (ret < 0) + mlog_errno(ret); return ret; } @@ -117,8 +112,6 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; - mlog_entry_void(); - BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); target = ocfs2_fast_symlink_getlink(inode, &bh); if (IS_ERR(target)) { @@ -142,7 +135,8 @@ bail: nd_set_link(nd, status ? 
ERR_PTR(status) : link); brelse(bh); - mlog_exit(status); + if (status) + mlog_errno(status); return NULL; } diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 902efb23b6a..3d635f4bbb2 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -27,7 +27,6 @@ #include <linux/types.h> #include <linux/highmem.h> -#define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> #include "ocfs2.h" diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index a0a120e82b9..52eaf33d346 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -54,14 +54,13 @@ #include <linux/buffer_head.h> #include <linux/rbtree.h> -#define MLOG_MASK_PREFIX ML_UPTODATE - #include <cluster/masklog.h> #include "ocfs2.h" #include "inode.h" #include "uptodate.h" +#include "ocfs2_trace.h" struct ocfs2_meta_cache_item { struct rb_node c_node; @@ -152,8 +151,8 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) while ((node = rb_last(root)) != NULL) { item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); - mlog(0, "Purge item %llu\n", - (unsigned long long) item->c_block); + trace_ocfs2_purge_copied_metadata_tree( + (unsigned long long) item->c_block); rb_erase(&item->c_node, root); kmem_cache_free(ocfs2_uptodate_cachep, item); @@ -180,9 +179,9 @@ void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci) tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); to_purge = ci->ci_num_cached; - mlog(0, "Purge %u %s items from Owner %llu\n", to_purge, - tree ? "array" : "tree", - (unsigned long long)ocfs2_metadata_cache_owner(ci)); + trace_ocfs2_metadata_cache_purge( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, tree); /* If we're a tree, save off the root so that we can safely * initialize the cache. We do the work to free tree members @@ -249,10 +248,10 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, ocfs2_metadata_cache_lock(ci); - mlog(0, "Owner %llu, query block %llu (inline = %u)\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long) bh->b_blocknr, - !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); + trace_ocfs2_buffer_cached_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) bh->b_blocknr, + !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) index = ocfs2_search_cache_array(ci, bh->b_blocknr); @@ -261,7 +260,7 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, ocfs2_metadata_cache_unlock(ci); - mlog(0, "index = %d, item = %p\n", index, item); + trace_ocfs2_buffer_cached_end(index, item); return (index != -1) || (item != NULL); } @@ -306,8 +305,9 @@ static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, { BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); - mlog(0, "block %llu takes position %u\n", (unsigned long long) block, - ci->ci_num_cached); + trace_ocfs2_append_cache_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); ci->ci_cache.ci_array[ci->ci_num_cached] = block; ci->ci_num_cached++; @@ -324,8 +324,9 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; struct ocfs2_meta_cache_item *tmp; - mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, - ci->ci_num_cached); + trace_ocfs2_insert_cache_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, ci->ci_num_cached); while(*p) { parent = *p; @@ -389,9 +390,9 @@ static void 
ocfs2_expand_cache(struct ocfs2_caching_info *ci, tree[i] = NULL; } - mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - ci->ci_flags, ci->ci_num_cached); + trace_ocfs2_expand_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_flags, ci->ci_num_cached); } /* Slow path function - memory allocation is necessary. See the @@ -405,9 +406,9 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = { NULL, }; - mlog(0, "Owner %llu, block %llu, expand = %d\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)block, expand_tree); + trace_ocfs2_set_buffer_uptodate( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)block, expand_tree); new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); if (!new) { @@ -433,7 +434,6 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, ocfs2_metadata_cache_lock(ci); if (ocfs2_insert_can_use_array(ci)) { - mlog(0, "Someone cleared the tree underneath us\n"); /* Ok, items were removed from the cache in between * locks. Detect this and revert back to the fast path */ ocfs2_append_cache_array(ci, block); @@ -490,9 +490,9 @@ void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, if (ocfs2_buffer_cached(ci, bh)) return; - mlog(0, "Owner %llu, inserting block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)bh->b_blocknr); + trace_ocfs2_set_buffer_uptodate_begin( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)bh->b_blocknr); /* No need to recheck under spinlock - insertion is guarded by * co_io_lock() */ @@ -542,8 +542,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, BUG_ON(index >= ci->ci_num_cached); BUG_ON(!ci->ci_num_cached); - mlog(0, "remove index %d (num_cached = %u\n", index, - ci->ci_num_cached); + trace_ocfs2_remove_metadata_array( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + index, ci->ci_num_cached); ci->ci_num_cached--; @@ -559,8 +560,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, struct ocfs2_meta_cache_item *item) { - mlog(0, "remove block %llu from tree\n", - (unsigned long long) item->c_block); + trace_ocfs2_remove_metadata_tree( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)item->c_block); rb_erase(&item->c_node, &ci->ci_cache.ci_tree); ci->ci_num_cached--; @@ -573,10 +575,10 @@ static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci, struct ocfs2_meta_cache_item *item = NULL; ocfs2_metadata_cache_lock(ci); - mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long) block, ci->ci_num_cached, - ci->ci_flags & OCFS2_CACHE_FL_INLINE); + trace_ocfs2_remove_block_from_cache( + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long) block, ci->ci_num_cached, + ci->ci_flags); if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { index = ocfs2_search_cache_array(ci, block); @@ -626,9 +628,6 @@ int __init init_ocfs2_uptodate_cache(void) if (!ocfs2_uptodate_cachep) return -ENOMEM; - mlog(0, "%u inlined cache items per inode.\n", - OCFS2_CACHE_INFO_MAX_ARRAY); - return 0; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 6bb602486c6..57a215dc2d9 100644 --- a/fs/ocfs2/xattr.c +++ 
b/fs/ocfs2/xattr.c @@ -37,7 +37,6 @@ #include <linux/string.h> #include <linux/security.h> -#define MLOG_MASK_PREFIX ML_XATTR #include <cluster/masklog.h> #include "ocfs2.h" @@ -57,6 +56,7 @@ #include "xattr.h" #include "refcounttree.h" #include "acl.h" +#include "ocfs2_trace.h" struct ocfs2_xattr_def_value_root { struct ocfs2_xattr_value_root xv; @@ -474,8 +474,7 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, struct ocfs2_xattr_block *xb = (struct ocfs2_xattr_block *)bh->b_data; - mlog(0, "Validating xattr block %llu\n", - (unsigned long long)bh->b_blocknr); + trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); @@ -715,11 +714,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); struct ocfs2_extent_tree et; - mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); - ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); while (clusters_to_add) { + trace_ocfs2_xattr_extend_allocation(clusters_to_add); + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -754,8 +753,6 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, */ BUG_ON(why == RESTART_META); - mlog(0, "restarting xattr value extension for %u" - " clusters,.\n", clusters_to_add); credits = ocfs2_calc_extend_credits(inode->i_sb, &vb->vb_xv->xr_list, clusters_to_add); @@ -3246,8 +3243,8 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, } meta_add += extra_meta; - mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " - "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits); + trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add, + clusters_add, *credits); if (meta_add) { ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, @@ -3887,8 +3884,10 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, if (found) { xs->here = &xs->header->xh_entries[index]; - mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, - (unsigned long long)bucket_blkno(xs->bucket), index); + trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)bucket_blkno(xs->bucket), + index); } else ret = -ENODATA; @@ -3915,8 +3914,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode, if (le16_to_cpu(el->l_next_free_rec) == 0) return -ENODATA; - mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n", - name, name_hash, name_index); + trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno, + name, name_index, name_hash, + (unsigned long long)root_bh->b_blocknr, + -1); ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, &num_clusters, el); @@ -3927,9 +3928,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode, BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); - mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " - "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno, - first_hash); + trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno, + name, name_index, first_hash, + (unsigned long long)p_blkno, + num_clusters); ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, p_blkno, first_hash, num_clusters, xs); @@ -3955,8 +3957,9 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, return -ENOMEM; } - mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", - clusters, (unsigned long long)blkno); + 
trace_ocfs2_iterate_xattr_buckets( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, clusters); for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { ret = ocfs2_read_xattr_bucket(bucket, blkno); @@ -3972,8 +3975,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, if (i == 0) num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); - mlog(0, "iterating xattr bucket %llu, first hash %u\n", - (unsigned long long)blkno, + trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno, le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); if (func) { ret = func(inode, bucket, para); @@ -4173,9 +4175,9 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, char *src = xb_bh->b_data; char *target = bucket_block(bucket, blks - 1); - mlog(0, "cp xattr from block %llu to bucket %llu\n", - (unsigned long long)xb_bh->b_blocknr, - (unsigned long long)bucket_blkno(bucket)); + trace_ocfs2_cp_xattr_block_to_bucket_begin( + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)bucket_blkno(bucket)); for (i = 0; i < blks; i++) memset(bucket_block(bucket, i), 0, blocksize); @@ -4211,8 +4213,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, for (i = 0; i < count; i++) le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); - mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n", - offset, size, off_change); + trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change); sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), cmp_xe, swap_xe); @@ -4261,8 +4262,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, struct ocfs2_xattr_tree_root *xr; u16 xb_flags = le16_to_cpu(xb->xb_flags); - mlog(0, "create xattr index block for %llu\n", - (unsigned long long)xb_bh->b_blocknr); + trace_ocfs2_xattr_create_index_block_begin( + (unsigned long long)xb_bh->b_blocknr); BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); BUG_ON(!xs->bucket); @@ -4295,8 +4296,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, */ blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - mlog(0, "allocate 1 cluster from %llu to xattr block\n", - (unsigned long long)blkno); + trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); if (ret) { @@ -4400,8 +4400,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, entries = (char *)xh->xh_entries; xh_free_start = le16_to_cpu(xh->xh_free_start); - mlog(0, "adjust xattr bucket in %llu, count = %u, " - "xh_free_start = %u, xh_name_value_len = %u.\n", + trace_ocfs2_defrag_xattr_bucket( (unsigned long long)blkno, le16_to_cpu(xh->xh_count), xh_free_start, le16_to_cpu(xh->xh_name_value_len)); @@ -4503,8 +4502,9 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); - mlog(0, "move half of xattrs in cluster %llu to %llu\n", - (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno); + trace_ocfs2_mv_xattr_bucket_cross_cluster( + (unsigned long long)last_cluster_blkno, + (unsigned long long)new_blkno); ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), last_cluster_blkno, new_blkno, @@ -4614,8 +4614,8 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, struct ocfs2_xattr_entry *xe; int blocksize = inode->i_sb->s_blocksize; - mlog(0, "move some of xattrs from bucket %llu to %llu\n", - (unsigned long long)blk, 
(unsigned long long)new_blk); + trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk, + (unsigned long long)new_blk); s_bucket = ocfs2_xattr_bucket_new(inode); t_bucket = ocfs2_xattr_bucket_new(inode); @@ -4714,9 +4714,9 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, */ xe = &xh->xh_entries[start]; len = sizeof(struct ocfs2_xattr_entry) * (count - start); - mlog(0, "mv xattr entry len %d from %d to %d\n", len, - (int)((char *)xe - (char *)xh), - (int)((char *)xh->xh_entries - (char *)xh)); + trace_ocfs2_divide_xattr_bucket_move(len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); memmove((char *)xh->xh_entries, (char *)xe, len); xe = &xh->xh_entries[count - start]; len = sizeof(struct ocfs2_xattr_entry) * start; @@ -4788,9 +4788,9 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, BUG_ON(s_blkno == t_blkno); - mlog(0, "cp bucket %llu to %llu, target is %d\n", - (unsigned long long)s_blkno, (unsigned long long)t_blkno, - t_is_new); + trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno, + (unsigned long long)t_blkno, + t_is_new); s_bucket = ocfs2_xattr_bucket_new(inode); t_bucket = ocfs2_xattr_bucket_new(inode); @@ -4862,8 +4862,8 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); struct ocfs2_xattr_bucket *old_first, *new_first; - mlog(0, "mv xattrs from cluster %llu to %llu\n", - (unsigned long long)last_blk, (unsigned long long)to_blk); + trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk, + (unsigned long long)to_blk); BUG_ON(start_bucket >= num_buckets); if (start_bucket) { @@ -5013,9 +5013,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, { int ret; - mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", - (unsigned long long)bucket_blkno(first), prev_clusters, - (unsigned long long)new_blk); + trace_ocfs2_adjust_xattr_cross_cluster( + (unsigned long long)bucket_blkno(first), + (unsigned long long)new_blk, prev_clusters); if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, @@ -5088,10 +5088,10 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; - mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " - "previous xattr blkno = %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - prev_cpos, (unsigned long long)bucket_blkno(first)); + trace_ocfs2_add_new_xattr_cluster_begin( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)bucket_blkno(first), + prev_cpos, prev_clusters); ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); @@ -5113,8 +5113,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, BUG_ON(num_bits > clusters_to_add); block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits); if (bucket_blkno(first) + (prev_clusters * bpc) == block && (prev_clusters + num_bits) << osb->s_clustersize_bits <= @@ -5130,8 +5129,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, */ v_start = prev_cpos + prev_clusters; *num_clusters = prev_clusters + num_bits; - mlog(0, "Add contiguous %u clusters to previous extent rec.\n", - num_bits); } else { ret = 
ocfs2_adjust_xattr_cross_cluster(inode, handle, @@ -5147,8 +5144,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, } } - mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", - num_bits, (unsigned long long)block, v_start); + trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block, + v_start, num_bits); ret = ocfs2_insert_extent(handle, &et, v_start, block, num_bits, 0, ctxt->meta_ac); if (ret < 0) { @@ -5183,9 +5180,9 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode, u64 end_blk; u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); - mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " - "from %llu, len = %u\n", (unsigned long long)target_blk, - (unsigned long long)bucket_blkno(first), num_clusters); + trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk, + (unsigned long long)bucket_blkno(first), + num_clusters, new_bucket); /* The extent must have room for an additional bucket */ BUG_ON(new_bucket >= @@ -5265,8 +5262,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode, /* The bucket at the front of the extent */ struct ocfs2_xattr_bucket *first; - mlog(0, "Add new xattr bucket starting from %llu\n", - (unsigned long long)bucket_blkno(target)); + trace_ocfs2_add_new_xattr_bucket( + (unsigned long long)bucket_blkno(target)); /* The first bucket of the original extent */ first = ocfs2_xattr_bucket_new(inode); @@ -5382,8 +5379,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, * modified something. We have to assume they did, and dirty * the whole bucket. This leaves us in a consistent state. */ - mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", - xe_off, (unsigned long long)bucket_blkno(bucket), len); + trace_ocfs2_xattr_bucket_value_truncate( + (unsigned long long)bucket_blkno(bucket), xe_off, len); ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); if (ret) { mlog_errno(ret); @@ -5433,8 +5430,9 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, ocfs2_init_dealloc_ctxt(&dealloc); - mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", - cpos, len, (unsigned long long)blkno); + trace_ocfs2_rm_xattr_cluster( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)blkno, cpos, len); ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, len); @@ -5538,7 +5536,7 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode, int ret; struct ocfs2_xa_loc loc; - mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name); + trace_ocfs2_xattr_set_entry_bucket(xi->xi_name); ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, xs->not_found ? 
NULL : xs->here); @@ -5570,7 +5568,6 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode, out: - mlog_exit(ret); return ret; } @@ -5581,7 +5578,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, { int ret; - mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name); + trace_ocfs2_xattr_set_entry_index_block(xi->xi_name); ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); if (!ret) @@ -5637,7 +5634,6 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, mlog_errno(ret); out: - mlog_exit(ret); return ret; } @@ -6041,9 +6037,9 @@ static int ocfs2_xattr_bucket_value_refcount(struct inode *inode, if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) p = &refcount; - mlog(0, "refcount bucket %llu, count = %u\n", - (unsigned long long)bucket_blkno(bucket), - le16_to_cpu(xh->xh_count)); + trace_ocfs2_xattr_bucket_value_refcount( + (unsigned long long)bucket_blkno(bucket), + le16_to_cpu(xh->xh_count)); for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { xe = &xh->xh_entries[i]; @@ -6339,8 +6335,8 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, u32 clusters, cpos, p_cluster, num_clusters; unsigned int ext_flags = 0; - mlog(0, "reflink xattr in container %llu, count = %u\n", - (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); + trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr, + le16_to_cpu(xh->xh_count)); last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { @@ -6540,8 +6536,8 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode, goto out; } - mlog(0, "create new xattr block for inode %llu, index = %d\n", - (unsigned long long)fe_bh->b_blocknr, indexed); + trace_ocfs2_create_empty_xattr_block( + (unsigned long long)fe_bh->b_blocknr, indexed); ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed, ret_bh); if (ret) @@ -6952,8 +6948,8 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, if (ret) mlog_errno(ret); - mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", - (unsigned long long)new_blkno, num_clusters, reflink_cpos); + trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno, + num_clusters, reflink_cpos); len -= num_clusters; blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); @@ -6982,8 +6978,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_extent_tree et; - mlog(0, "reflink xattr buckets %llu len %u\n", - (unsigned long long)blkno, len); + trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len); ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(args->reflink->new_inode), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7c708a418ac..2e7addfd980 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -182,7 +182,8 @@ static void m_stop(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - vma_stop(priv, vma); + if (!IS_ERR(vma)) + vma_stop(priv, vma); if (priv->task) put_task_struct(priv->task); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a2a622e079f..fcc8ae75d87 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -76,7 +76,7 @@ #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/quotaops.h> -#include <linux/writeback.h> /* for inode_lock, oddly enough.. 
*/ +#include "../internal.h" /* ugh */ #include <asm/uaccess.h> @@ -900,33 +900,38 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); continue; + } #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - if (!atomic_read(&inode->i_writecount)) - continue; - if (!dqinit_needed(inode, type)) - continue; - __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); __dquot_initialize(inode, type); - /* We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the inode_lock. - * We cannot iput the inode now as we can be holding the last - * reference and we cannot iput it under inode_lock. So we - * keep the reference and iput it later. */ + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ old_inode = inode; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1007,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct inode *inode; int reserved = 0; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already @@ -1021,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, remove_inode_dquot_ref(inode, type, tofree_head); } } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index c05324d3282..596bb2c9de4 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -94,75 +94,6 @@ xfs_buf_vmap_len( } /* - * Page Region interfaces. - * - * For pages in filesystems where the blocksize is smaller than the - * pagesize, we use the page->private field (long) to hold a bitmap - * of uptodate regions within the page. - * - * Each such region is "bytes per page / bits per long" bytes long. 
- * - * NBPPR == number-of-bytes-per-page-region - * BTOPR == bytes-to-page-region (rounded up) - * BTOPRT == bytes-to-page-region-truncated (rounded down) - */ -#if (BITS_PER_LONG == 32) -#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ -#elif (BITS_PER_LONG == 64) -#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ -#else -#error BITS_PER_LONG must be 32 or 64 -#endif -#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) -#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) -#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) - -STATIC unsigned long -page_region_mask( - size_t offset, - size_t length) -{ - unsigned long mask; - int first, final; - - first = BTOPR(offset); - final = BTOPRT(offset + length - 1); - first = min(first, final); - - mask = ~0UL; - mask <<= BITS_PER_LONG - (final - first + 1); - mask >>= BITS_PER_LONG - (final + 1); - - ASSERT(offset + length <= PAGE_CACHE_SIZE); - ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); - - return mask; -} - -STATIC void -set_page_region( - struct page *page, - size_t offset, - size_t length) -{ - set_page_private(page, - page_private(page) | page_region_mask(offset, length)); - if (page_private(page) == ~0UL) - SetPageUptodate(page); -} - -STATIC int -test_page_region( - struct page *page, - size_t offset, - size_t length) -{ - unsigned long mask = page_region_mask(offset, length); - - return (mask && (page_private(page) & mask) == mask); -} - -/* * xfs_buf_lru_add - add a buffer to the LRU. * * The LRU takes a new reference to the buffer so that it will only be freed @@ -332,7 +263,7 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { + if (bp->b_flags & _XBF_PAGES) { uint i; if (xfs_buf_is_vmapped(bp)) @@ -342,25 +273,22 @@ xfs_buf_free( for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; - if (bp->b_flags & _XBF_PAGE_CACHE) - ASSERT(!PagePrivate(page)); - page_cache_release(page); + __free_page(page); } - } + } else if (bp->b_flags & _XBF_KMEM) + kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); xfs_buf_deallocate(bp); } /* - * Finds all pages for buffer in question and builds it's page list. + * Allocates all the pages for buffer in question and builds its page list. */ STATIC int -_xfs_buf_lookup_pages( +xfs_buf_allocate_memory( xfs_buf_t *bp, uint flags) { - struct address_space *mapping = bp->b_target->bt_mapping; - size_t blocksize = bp->b_target->bt_bsize; size_t size = bp->b_count_desired; size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); @@ -369,29 +297,55 @@ _xfs_buf_lookup_pages( xfs_off_t end; int error; + /* + * for buffers that are contained within a single page, just allocate + * the memory from the heap - there's no need for the complexity of + * page arrays to keep allocation down to order 0.
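To make the page-region bitmap being deleted above concrete: with 4 KiB pages and 64-bit longs, PRSHIFT = PAGE_CACHE_SHIFT - 6 = 6 and NBPPR = 4096 / 64 = 64, so each bit of page->private covers 64 bytes. A worked call, set_page_region(page, 128, 256), i.e. bytes 128..383:

        first = BTOPR(128)            = (128 + 63) >> 6 = 2
        final = BTOPRT(128 + 256 - 1) = 383 >> 6        = 5
        mask  = (~0UL << (64 - (5 - 2 + 1))) >> (64 - (5 + 1))

which leaves bits 2..5 set, one per 64-byte region touched. Only once accumulated calls have set all 64 bits (page_private(page) == ~0UL) does SetPageUptodate() fire. The replacement scheme below sidesteps all of this bookkeeping by never sharing a page between buffers.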
+ */ + if (bp->b_buffer_length < PAGE_SIZE) { + bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + if (!bp->b_addr) { + /* low memory - use alloc_page loop instead */ + goto use_alloc_page; + } + + if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & + PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + goto use_alloc_page; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + return 0; + } + +use_alloc_page: end = bp->b_file_offset + bp->b_buffer_length; page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); - error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; - bp->b_flags |= _XBF_PAGE_CACHE; offset = bp->b_offset; - first = bp->b_file_offset >> PAGE_CACHE_SHIFT; + first = bp->b_file_offset >> PAGE_SHIFT; + bp->b_flags |= _XBF_PAGES; for (i = 0; i < bp->b_page_count; i++) { struct page *page; uint retries = 0; - - retry: - page = find_or_create_page(mapping, first + i, gfp_mask); +retry: + page = alloc_page(gfp_mask); if (unlikely(page == NULL)) { if (flags & XBF_READ_AHEAD) { bp->b_page_count = i; - for (i = 0; i < bp->b_page_count; i++) - unlock_page(bp->b_pages[i]); - return -ENOMEM; + error = ENOMEM; + goto out_free_pages; } /* @@ -412,33 +366,16 @@ _xfs_buf_lookup_pages( XFS_STATS_INC(xb_page_found); - nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); + nbytes = min_t(size_t, size, PAGE_SIZE - offset); size -= nbytes; - - ASSERT(!PagePrivate(page)); - if (!PageUptodate(page)) { - page_count--; - if (blocksize >= PAGE_CACHE_SIZE) { - if (flags & XBF_READ) - bp->b_flags |= _XBF_PAGE_LOCKED; - } else if (!PagePrivate(page)) { - if (test_page_region(page, offset, nbytes)) - page_count++; - } - } - bp->b_pages[i] = page; offset = 0; } + return 0; - if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { - for (i = 0; i < bp->b_page_count; i++) - unlock_page(bp->b_pages[i]); - } - - if (page_count == bp->b_page_count) - bp->b_flags |= XBF_DONE; - +out_free_pages: + for (i = 0; i < bp->b_page_count; i++) + __free_page(bp->b_pages[i]); return error; } @@ -450,14 +387,23 @@ _xfs_buf_map_pages( xfs_buf_t *bp, uint flags) { - /* A single page buffer is always mappable */ + ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { + /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); - if (unlikely(bp->b_addr == NULL)) + int retried = 0; + + do { + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); + if (bp->b_addr) + break; + vm_unmap_aliases(); + } while (retried++ <= 1); + + if (!bp->b_addr) return -ENOMEM; bp->b_addr += bp->b_offset; bp->b_flags |= XBF_MAPPED; @@ -568,9 +514,14 @@ found: } } + /* + * if the buffer is stale, clear all the external state associated with + * it. We need to keep flags such as how we allocated the buffer memory + * intact here. 
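The PAGE_MASK comparison in xfs_buf_allocate_memory() above simply asks whether the first and last bytes of the heap allocation land on the same page, since virt_to_page() of a spanning allocation could not describe it with a single page pointer. A standalone userspace illustration with made-up addresses (not kernel code):

        #include <stdint.h>
        #include <stdio.h>

        #define PAGE_SIZE 4096UL
        #define PAGE_MASK (~(PAGE_SIZE - 1))

        /* Does [addr, addr + len) cross a page boundary? (the test used above) */
        static int spans_two_pages(uintptr_t addr, size_t len)
        {
                return ((addr + len - 1) & PAGE_MASK) != (addr & PAGE_MASK);
        }

        int main(void)
        {
                /* 0x1f80 + 0x100 - 1 = 0x207f: last byte is on the 0x2000 page */
                printf("%d\n", spans_two_pages(0x1f80, 0x100)); /* prints 1 */
                /* 0x1000 + 0x200 - 1 = 0x11ff: same page as 0x1000 */
                printf("%d\n", spans_two_pages(0x1000, 0x200)); /* prints 0 */
                return 0;
        }

In the first case the kernel path would kmem_free() the allocation and fall back to alloc_page(), exactly as the "b_addr spans two pages" branch does.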
@@ -450,14 +387,23 @@ _xfs_buf_map_pages(
 	xfs_buf_t		*bp,
 	uint			flags)
 {
-	/* A single page buffer is always mappable */
+	ASSERT(bp->b_flags & _XBF_PAGES);
 	if (bp->b_page_count == 1) {
+		/* A single page buffer is always mappable */
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-					-1, PAGE_KERNEL);
-		if (unlikely(bp->b_addr == NULL))
+		int retried = 0;
+
+		do {
+			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+						-1, PAGE_KERNEL);
+			if (bp->b_addr)
+				break;
+			vm_unmap_aliases();
+		} while (retried++ <= 1);
+
+		if (!bp->b_addr)
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
@@ -568,9 +514,14 @@ found:
 		}
 	}
 
+	/*
+	 * if the buffer is stale, clear all the external state associated with
+	 * it. We need to keep flags such as how we allocated the buffer memory
+	 * intact here.
+	 */
 	if (bp->b_flags & XBF_STALE) {
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-		bp->b_flags &= XBF_MAPPED;
+		bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
 	}
 
 	trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -591,7 +542,7 @@ xfs_buf_get(
 	xfs_buf_flags_t		flags)
 {
 	xfs_buf_t		*bp, *new_bp;
-	int			error = 0, i;
+	int			error = 0;
 
 	new_bp = xfs_buf_allocate(flags);
 	if (unlikely(!new_bp))
@@ -599,7 +550,7 @@ xfs_buf_get(
 
 	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
 	if (bp == new_bp) {
-		error = _xfs_buf_lookup_pages(bp, flags);
+		error = xfs_buf_allocate_memory(bp, flags);
 		if (error)
 			goto no_buffer;
 	} else {
@@ -608,9 +559,6 @@ xfs_buf_get(
 		return NULL;
 	}
 
-	for (i = 0; i < bp->b_page_count; i++)
-		mark_page_accessed(bp->b_pages[i]);
-
 	if (!(bp->b_flags & XBF_MAPPED)) {
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
@@ -711,8 +659,7 @@ xfs_buf_readahead(
 {
 	struct backing_dev_info *bdi;
 
-	bdi = target->bt_mapping->backing_dev_info;
-	if (bdi_read_congested(bdi))
+	if (bdi_read_congested(target->bt_bdi))
 		return;
 
 	xfs_buf_read(target, ioff, isize,
@@ -790,10 +737,10 @@ xfs_buf_associate_memory(
 	size_t			buflen;
 	int			page_count;
 
-	pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
+	pageaddr = (unsigned long)mem & PAGE_MASK;
 	offset = (unsigned long)mem - pageaddr;
-	buflen = PAGE_CACHE_ALIGN(len + offset);
-	page_count = buflen >> PAGE_CACHE_SHIFT;
+	buflen = PAGE_ALIGN(len + offset);
+	page_count = buflen >> PAGE_SHIFT;
 
 	/* Free any previous set of page pointers */
 	if (bp->b_pages)
@@ -810,13 +757,12 @@ xfs_buf_associate_memory(
 
 	for (i = 0; i < bp->b_page_count; i++) {
 		bp->b_pages[i] = mem_to_page((void *)pageaddr);
-		pageaddr += PAGE_CACHE_SIZE;
+		pageaddr += PAGE_SIZE;
 	}
 
 	bp->b_count_desired = len;
 	bp->b_buffer_length = buflen;
 	bp->b_flags |= XBF_MAPPED;
-	bp->b_flags &= ~_XBF_PAGE_LOCKED;
 
 	return 0;
 }
@@ -923,20 +869,7 @@ xfs_buf_rele(
 
 /*
- *	Mutual exclusion on buffers. Locking model:
- *
- *	Buffers associated with inodes for which buffer locking
- *	is not enabled are not protected by semaphores, and are
- *	assumed to be exclusively owned by the caller. There is a
- *	spinlock in the buffer, used by the caller when concurrent
- *	access is possible.
- */
-
-/*
- *	Locks a buffer object, if it is not already locked. Note that this in
- *	no way locks the underlying pages, so it is only useful for
- *	synchronizing concurrent use of buffer objects, not for synchronizing
- *	independent access to the underlying pages.
+ * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
@@ -970,10 +903,7 @@ xfs_buf_lock_value(
 }
 
 /*
- *	Locks a buffer object.
- *	Note that this in no way locks the underlying pages, so it is only
- *	useful for synchronizing concurrent use of buffer objects, not for
- *	synchronizing independent access to the underlying pages.
+ * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
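When _xfs_buf_find recycles a stale buffer in the hunk above, it masks b_flags with an allow-list instead of zeroing it, so the record of how the memory was allocated survives the reuse. A small sketch of that idiom, with hypothetical flag values:

#include <assert.h>

/* hypothetical buffer flags mirroring the allow-list idiom above */
enum {
	MAPPED	 = 1 << 0,	/* buffer has a virtual mapping */
	KMEM	 = 1 << 1,	/* memory came from the heap */
	PAGES	 = 1 << 2,	/* memory came from page allocation */
	STALE	 = 1 << 3,	/* contents invalid */
	DELWRI_Q = 1 << 4,	/* queued for delayed write */
};

#define KEEP_ON_REUSE	(MAPPED | KMEM | PAGES)

int main(void)
{
	unsigned int flags = KMEM | MAPPED | STALE;

	/* reusing a stale buffer: drop state, keep allocation bookkeeping */
	flags &= KEEP_ON_REUSE;
	assert(flags == (KMEM | MAPPED));
	return 0;
}

Losing the KMEM or PAGES bit here would later steer xfs_buf_free away from the matching free path, so an allow-list mask is a safer shape than clearing everything and re-adding bits.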
@@ -1246,10 +1176,8 @@ _xfs_buf_ioend(
 	xfs_buf_t		*bp,
 	int			schedule)
 {
-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
-		bp->b_flags &= ~_XBF_PAGE_LOCKED;
+	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
 		xfs_buf_ioend(bp, schedule);
-	}
 }
 
 STATIC void
@@ -1258,35 +1186,12 @@ xfs_buf_bio_end_io(
 	int			error)
 {
 	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
-	unsigned int		blocksize = bp->b_target->bt_bsize;
-	struct bio_vec		*bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 
 	xfs_buf_ioerror(bp, -error);
 
 	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
-	do {
-		struct page	*page = bvec->bv_page;
-
-		ASSERT(!PagePrivate(page));
-		if (unlikely(bp->b_error)) {
-			if (bp->b_flags & XBF_READ)
-				ClearPageUptodate(page);
-		} else if (blocksize >= PAGE_CACHE_SIZE) {
-			SetPageUptodate(page);
-		} else if (!PagePrivate(page) &&
-				(bp->b_flags & _XBF_PAGE_CACHE)) {
-			set_page_region(page, bvec->bv_offset, bvec->bv_len);
-		}
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (bp->b_flags & _XBF_PAGE_LOCKED)
-			unlock_page(page);
-	} while (bvec >= bio->bi_io_vec);
-
 	_xfs_buf_ioend(bp, 1);
 	bio_put(bio);
 }
@@ -1300,7 +1205,6 @@ _xfs_buf_ioapply(
 	int			offset = bp->b_offset;
 	int			size = bp->b_count_desired;
 	sector_t		sector = bp->b_bn;
-	unsigned int		blocksize = bp->b_target->bt_bsize;
 
 	total_nr_pages = bp->b_page_count;
 	map_i = 0;
@@ -1321,29 +1225,6 @@ _xfs_buf_ioapply(
 		(bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
 	}
 
-	/* Special code path for reading a sub page size buffer in --
-	 * we populate up the whole page, and hence the other metadata
-	 * in the same page. This optimization is only valid when the
-	 * filesystem block size is not smaller than the page size.
-	 */
-	if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
-	    ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
-	      (XBF_READ|_XBF_PAGE_LOCKED)) &&
-	    (blocksize >= PAGE_CACHE_SIZE)) {
-		bio = bio_alloc(GFP_NOIO, 1);
-
-		bio->bi_bdev = bp->b_target->bt_bdev;
-		bio->bi_sector = sector - (offset >> BBSHIFT);
-		bio->bi_end_io = xfs_buf_bio_end_io;
-		bio->bi_private = bp;
-
-		bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
-		size = 0;
-
-		atomic_inc(&bp->b_io_remaining);
-
-		goto submit_io;
-	}
-
next_chunk:
	atomic_inc(&bp->b_io_remaining);
@@ -1357,8 +1238,9 @@ next_chunk:
 	bio->bi_end_io = xfs_buf_bio_end_io;
 	bio->bi_private = bp;
 
+
 	for (; size && nr_pages; nr_pages--, map_i++) {
-		int	rbytes, nbytes = PAGE_CACHE_SIZE - offset;
+		int	rbytes, nbytes = PAGE_SIZE - offset;
 
 		if (nbytes > size)
 			nbytes = size;
@@ -1373,7 +1255,6 @@ next_chunk:
 		total_nr_pages--;
 	}
 
-submit_io:
 	if (likely(bio->bi_size)) {
 		if (xfs_buf_is_vmapped(bp)) {
 			flush_kernel_vmap_range(bp->b_addr,
@@ -1383,18 +1264,7 @@ submit_io:
 		if (size)
 			goto next_chunk;
 	} else {
-		/*
-		 * if we get here, no pages were added to the bio. However,
-		 * we can't just error out here - if the pages are locked then
-		 * we have to unlock them otherwise we can hang on a later
-		 * access to the page.
-		 */
 		xfs_buf_ioerror(bp, EIO);
-		if (bp->b_flags & _XBF_PAGE_LOCKED) {
-			int i;
-			for (i = 0; i < bp->b_page_count; i++)
-				unlock_page(bp->b_pages[i]);
-		}
 		bio_put(bio);
 	}
 }
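_xfs_buf_ioapply above packs the page array into bios a chunk at a time: each page contributes at most PAGE_SIZE bytes, and only the first page carries a non-zero offset. A userspace sketch of just that chunking arithmetic, with hypothetical values and 4 KiB pages assumed:

#include <stdio.h>

#define PAGE_SIZE	4096UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* a buffer of 10000 bytes starting 512 bytes into its first page */
	unsigned long size = 10000, offset = 512, page = 0;

	/* each iteration models one bio_add_page() of at most a page */
	while (size) {
		unsigned long nbytes = min_ul(size, PAGE_SIZE - offset);

		printf("page %lu: offset %lu, %lu bytes\n",
		       page, offset, nbytes);
		size -= nbytes;
		offset = 0;	/* only the first page has an offset */
		page++;
	}
	return 0;
}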
@@ -1458,8 +1328,8 @@ xfs_buf_offset(
 		return XFS_BUF_PTR(bp) + offset;
 
 	offset += bp->b_offset;
-	page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
-	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
+	page = bp->b_pages[offset >> PAGE_SHIFT];
+	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
 }
 
 /*
@@ -1481,9 +1351,9 @@ xfs_buf_iomove(
 		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
 		cpoff = xfs_buf_poff(boff + bp->b_offset);
 		csize = min_t(size_t,
-			      PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
+			      PAGE_SIZE-cpoff, bp->b_count_desired-boff);
 
-		ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
+		ASSERT(((csize + cpoff) <= PAGE_SIZE));
 
 		switch (mode) {
 		case XBRW_ZERO:
@@ -1596,7 +1466,6 @@ xfs_free_buftarg(
 	xfs_flush_buftarg(btp, 1);
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
-	iput(btp->bt_mapping->host);
 
 	kthread_stop(btp->bt_task);
 	kmem_free(btp);
@@ -1620,15 +1489,6 @@ xfs_setsize_buftarg_flags(
 		return EINVAL;
 	}
 
-	if (verbose &&
-	    (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
-		printk(KERN_WARNING
-			"XFS: %u byte sectors in use on device %s. "
-			"This is suboptimal; %u or greater is ideal.\n",
-			sectorsize, XFS_BUFTARG_NAME(btp),
-			(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
-	}
-
 	return 0;
 }
 
@@ -1643,7 +1503,7 @@ xfs_setsize_buftarg_early(
 	struct block_device	*bdev)
 {
 	return xfs_setsize_buftarg_flags(btp,
-			PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
+			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
 }
 
 int
@@ -1656,40 +1516,6 @@ xfs_setsize_buftarg(
 }
 
 STATIC int
-xfs_mapping_buftarg(
-	xfs_buftarg_t		*btp,
-	struct block_device	*bdev)
-{
-	struct backing_dev_info	*bdi;
-	struct inode		*inode;
-	struct address_space	*mapping;
-	static const struct address_space_operations mapping_aops = {
-		.migratepage = fail_migrate_page,
-	};
-
-	inode = new_inode(bdev->bd_inode->i_sb);
-	if (!inode) {
-		printk(KERN_WARNING
-			"XFS: Cannot allocate mapping inode for device %s\n",
-			XFS_BUFTARG_NAME(btp));
-		return ENOMEM;
-	}
-	inode->i_ino = get_next_ino();
-	inode->i_mode = S_IFBLK;
-	inode->i_bdev = bdev;
-	inode->i_rdev = bdev->bd_dev;
-	bdi = blk_get_backing_dev_info(bdev);
-	if (!bdi)
-		bdi = &default_backing_dev_info;
-	mapping = &inode->i_data;
-	mapping->a_ops = &mapping_aops;
-	mapping->backing_dev_info = bdi;
-	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	btp->bt_mapping = mapping;
-	return 0;
-}
-
-STATIC int
 xfs_alloc_delwrite_queue(
 	xfs_buftarg_t		*btp,
 	const char		*fsname)
@@ -1717,12 +1543,14 @@ xfs_alloc_buftarg(
 	btp->bt_mount = mp;
 	btp->bt_dev = bdev->bd_dev;
 	btp->bt_bdev = bdev;
+	btp->bt_bdi = blk_get_backing_dev_info(bdev);
+	if (!btp->bt_bdi)
+		goto error;
+
 	INIT_LIST_HEAD(&btp->bt_lru);
 	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
-	if (xfs_mapping_buftarg(btp, bdev))
-		goto error;
 	if (xfs_alloc_delwrite_queue(btp, fsname))
 		goto error;
 	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
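xfs_buf_offset above maps a linear byte offset to a page index and an in-page offset with one shift and one mask. A minimal demonstration of that arithmetic, assuming 4 KiB pages:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long offset = 9000;	/* byte offset into a buffer */
	unsigned long index = offset >> PAGE_SHIFT;	/* which page */
	unsigned long poff = offset & (PAGE_SIZE - 1);	/* where in it */

	assert(index * PAGE_SIZE + poff == offset);
	printf("offset %lu -> page %lu, byte %lu\n", offset, index, poff);
	return 0;
}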
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index cbe65950e52..a9a1c451264 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -61,30 +61,11 @@ typedef enum {
 #define XBF_DONT_BLOCK	(1 << 16)/* do not block in current thread */
 
 /* flags used only internally */
-#define _XBF_PAGE_CACHE	(1 << 17)/* backed by pagecache */
 #define _XBF_PAGES	(1 << 18)/* backed by refcounted pages */
 #define _XBF_RUN_QUEUES	(1 << 19)/* run block device task queue */
+#define _XBF_KMEM	(1 << 20)/* backed by heap memory */
 #define _XBF_DELWRI_Q	(1 << 21)/* buffer on delwri queue */
 
-/*
- * Special flag for supporting metadata blocks smaller than a FSB.
- *
- * In this case we can have multiple xfs_buf_t on a single page and
- * need to lock out concurrent xfs_buf_t readers as they only
- * serialise access to the buffer.
- *
- * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
- * between reads of the page. Hence we can have one thread read the
- * page and modify it, but then race with another thread that thinks
- * the page is not up-to-date and hence reads it again.
- *
- * The result is that the first modifcation to the page is lost.
- * This sort of AGF/AGI reading race can happen when unlinking inodes
- * that require truncation and results in the AGI unlinked list
- * modifications being lost.
- */
-#define _XBF_PAGE_LOCKED	(1 << 22)
-
 typedef unsigned int xfs_buf_flags_t;
 
 #define XFS_BUF_FLAGS \
@@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_LOCK,		"LOCK" },	/* should never be set */\
 	{ XBF_TRYLOCK,		"TRYLOCK" },	/* ditto */\
 	{ XBF_DONT_BLOCK,	"DONT_BLOCK" },	/* ditto */\
-	{ _XBF_PAGE_CACHE,	"PAGE_CACHE" }, \
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_RUN_QUEUES,	"RUN_QUEUES" }, \
-	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }
-
+	{ _XBF_KMEM,		"KMEM" }, \
+	{ _XBF_DELWRI_Q,	"DELWRI_Q" }
 
 typedef enum {
 	XBT_FORCE_SLEEP = 0,
@@ -120,7 +99,7 @@ typedef struct xfs_bufhash {
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
-	struct address_space	*bt_mapping;
+	struct backing_dev_info	*bt_bdi;
 	struct xfs_mount	*bt_mount;
 	unsigned int		bt_bsize;
 	unsigned int		bt_sshift;
@@ -139,17 +118,6 @@ typedef struct xfs_buftarg {
 	unsigned int		bt_lru_nr;
 } xfs_buftarg_t;
 
-/*
- *	xfs_buf_t: Buffer structure for pagecache-based buffers
- *
- *	This buffer structure is used by the pagecache buffer management routines
- *	to refer to an assembly of pages forming a logical buffer.
- *
- *	The buffer structure is used on a temporary basis only, and discarded when
- *	released. The real data storage is recorded in the pagecache. Buffers are
- *	hashed to the block device on which the file system resides.
- */
-
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
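The XFS_BUF_FLAGS table edited above pairs each flag with a string so tracepoints can print readable flag sets. A from-scratch sketch of that decode pattern; the flag values and names here are hypothetical, not the XFS ones:

#include <stdio.h>

struct flag_name {
	unsigned int	flag;
	const char	*name;
};

/* hypothetical flags mirroring the { flag, "NAME" } table above */
enum { PAGES = 1 << 0, KMEM = 1 << 1, DELWRI_Q = 1 << 2 };

static const struct flag_name names[] = {
	{ PAGES,	"PAGES" },
	{ KMEM,		"KMEM" },
	{ DELWRI_Q,	"DELWRI_Q" },
};

static void print_flags(unsigned int flags)
{
	const char *sep = "";
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		if (flags & names[i].flag) {
			printf("%s%s", sep, names[i].name);
			sep = "|";
		}
	}
	putchar('\n');
}

int main(void)
{
	print_flags(KMEM | DELWRI_Q);	/* prints "KMEM|DELWRI_Q" */
	return 0;
}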
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index a55c1b46b21..52aadfbed13 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -896,6 +896,7 @@ xfs_file_fallocate(
 	xfs_flock64_t	bf;
 	xfs_inode_t	*ip = XFS_I(inode);
 	int		cmd = XFS_IOC_RESVSP;
+	int		attr_flags = XFS_ATTR_NOLOCK;
 
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
@@ -918,7 +919,10 @@ xfs_file_fallocate(
 		goto out_unlock;
 	}
 
-	error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
+	if (file->f_flags & O_DSYNC)
+		attr_flags |= XFS_ATTR_SYNC;
+
+	error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 0ca0e3c024d..acca2c5ca3f 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -624,6 +624,10 @@ xfs_ioc_space(
 
 	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
 		attr_flags |= XFS_ATTR_NONBLOCK;
+
+	if (filp->f_flags & O_DSYNC)
+		attr_flags |= XFS_ATTR_SYNC;
+
 	if (ioflags & IO_INVIS)
 		attr_flags |= XFS_ATTR_DMI;
 
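The two hunks above wire O_DSYNC through to space preallocation: callers that opened the file with O_DSYNC get a synchronous transaction commit, and, per a later hunk in this patch, everyone else stops paying for one unconditionally. A hedged userspace illustration of the syscall path affected; Linux-specific, error handling trimmed, file name arbitrary:

/* illustration of the call path affected above; Linux-specific */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_WRONLY | O_CREAT | O_DSYNC, 0644);

	if (fd < 0)
		return 1;
	/*
	 * With the patch, this preallocation commits its transaction
	 * synchronously because the descriptor is O_DSYNC; previously
	 * every caller paid for a synchronous commit unconditionally.
	 */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) != 0)
		perror("fallocate");
	close(fd);
	return 0;
}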
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 818c4cf2de8..1ba5c451da3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1078,7 +1078,7 @@ xfs_fs_write_inode(
 			error = 0;
 			goto out_unlock;
 		}
-		error = xfs_iflush(ip, 0);
+		error = xfs_iflush(ip, SYNC_TRYLOCK);
 	}
 
  out_unlock:
@@ -1539,10 +1539,14 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_free_sb;
 
-	error = xfs_mountfs(mp);
-	if (error)
-		goto out_filestream_unmount;
-
+	/*
+	 * we must configure the block size in the superblock before we run the
+	 * full mount process as the mount process can lookup and cache inodes.
+	 * For the same reason we must also initialise the syncd and register
+	 * the inode cache shrinker so that inodes can be reclaimed during
+	 * operations like a quotacheck that iterate all inodes in the
+	 * filesystem.
+	 */
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1550,6 +1554,16 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
+	error = xfs_syncd_init(mp);
+	if (error)
+		goto out_filestream_unmount;
+
+	xfs_inode_shrinker_register(mp);
+
+	error = xfs_mountfs(mp);
+	if (error)
+		goto out_syncd_stop;
+
 	root = igrab(VFS_I(mp->m_rootip));
 	if (!root) {
 		error = ENOENT;
@@ -1565,14 +1579,11 @@ xfs_fs_fill_super(
 		goto fail_vnrele;
 	}
 
-	error = xfs_syncd_init(mp);
-	if (error)
-		goto fail_vnrele;
-
-	xfs_inode_shrinker_register(mp);
-
 	return 0;
 
+ out_syncd_stop:
+	xfs_inode_shrinker_unregister(mp);
+	xfs_syncd_stop(mp);
 out_filestream_unmount:
 	xfs_filestream_unmount(mp);
 out_free_sb:
@@ -1596,6 +1607,9 @@ xfs_fs_fill_super(
 	}
 
  fail_unmount:
+	xfs_inode_shrinker_unregister(mp);
+	xfs_syncd_stop(mp);
+
 	/*
	 * Blow away any referenced inode in the filestreams cache.
	 * This can and will cause log traffic as inodes go inactive
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6c10f1d2e3d..594cd822d84 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -761,8 +761,10 @@ xfs_reclaim_inode(
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
-	int	error = 0;
+	int	error;
 
+restart:
+	error = 0;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
@@ -788,9 +790,31 @@ xfs_reclaim_inode(
 	if (xfs_inode_clean(ip))
 		goto reclaim;
 
-	/* Now we have an inode that needs flushing */
-	error = xfs_iflush(ip, sync_mode);
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+	 * reclaim as we can deadlock with inode cluster removal.
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here. As a result,
+	 * doing a blocking xfs_itobp() to get the cluster buffer will result
+	 * in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluster() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+	 * just unlock the inode, back off and try again. Hopefully the next
+	 * pass through will see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
 	if (sync_mode & SYNC_WAIT) {
+		if (error == EAGAIN) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			/* backoff longer than in xfs_ifree_cluster */
+			delay(2);
+			goto restart;
+		}
 		xfs_iflock(ip);
 		goto reclaim;
 	}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da871f53223..742c8330994 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,7 +2835,7 @@ xfs_iflush(
 	 * Get the buffer containing the on-disk inode.
 	 */
 	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
-				(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
+				(flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
 	if (error || !bp) {
 		xfs_ifunlock(ip);
 		return error;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fd4f398bd6f..46cc40131d4 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -760,11 +760,11 @@ xfs_inode_item_push(
 	 * Push the inode to it's backing buffer. This will not remove the
 	 * inode from the AIL - a further push will be required to trigger a
 	 * buffer push. However, this allows all the dirty inodes to be pushed
-	 * to the buffer before it is pushed to disk. THe buffer IO completion
-	 * will pull th einode from the AIL, mark it clean and unlock the flush
+	 * to the buffer before it is pushed to disk. The buffer IO completion
+	 * will pull the inode from the AIL, mark it clean and unlock the flush
 	 * lock.
 	 */
-	(void) xfs_iflush(ip, 0);
+	(void) xfs_iflush(ip, SYNC_TRYLOCK);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 }
 
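The xfs_reclaim_inode change above is the standard cure for an ABBA inversion: reclaim takes the inode lock then the cluster buffer lock, while xfs_ifree_cluster takes them in the opposite order, so reclaim switches to a trylock and backs off on failure. A self-contained pthreads sketch of the pattern; the locks and sleep interval are illustrative, not XFS code:

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

/* reclaim path: needs ilock then buf_lock, the reverse of the free path */
static void reclaim(void)
{
restart:
	pthread_mutex_lock(&ilock);
	if (pthread_mutex_trylock(&buf_lock) != 0) {
		/* a holder may be acquiring in the opposite order:
		 * drop our lock, back off, and retry from the top */
		pthread_mutex_unlock(&ilock);
		usleep(2000);		/* cf. delay(2) above */
		goto restart;
	}

	/* both locks held: safe to flush and reclaim */

	pthread_mutex_unlock(&buf_lock);
	pthread_mutex_unlock(&ilock);
}

int main(void)
{
	reclaim();
	return 0;
}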
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 3bea6613233..03b3b7f85a3 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -383,7 +383,8 @@ xfs_trans_read_buf(
 	bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
 	if (bp == NULL) {
 		*bpp = NULL;
-		return 0;
+		return (flags & XBF_TRYLOCK) ?
+					0 : XFS_ERROR(ENOMEM);
 	}
 
 	if (XFS_BUF_GETERROR(bp) != 0) {
 		XFS_BUF_SUPER_STALE(bp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 37d8146ee15..c48b4217ec4 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2831,7 +2831,8 @@ xfs_change_file_space(
 		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
+	if (attr_flags & XFS_ATTR_SYNC)
+		xfs_trans_set_sync(tp);
 
 	error = xfs_trans_commit(tp, 0);
 
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index f6702927eee..3bcd23353d6 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define XFS_ATTR_NONBLOCK	0x02	/* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
 #define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
+#define XFS_ATTR_SYNC		0x10	/* synchronous operation required */
 
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_release(struct xfs_inode *ip);
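The xfs_trans_read_buf hunk above tightens a contract: a NULL buffer with a zero return is now legitimate only when the caller asked for a trylock, while blocking callers see ENOMEM instead of silent success. A compact sketch of that convention with hypothetical names:

#include <errno.h>
#include <stdio.h>

#define TRYLOCK	0x1

struct buf { int dummy; };

/* stand-in for the buffer lookup; returns NULL to model contention */
static struct buf *read_buf(int flags)
{
	(void)flags;
	return NULL;
}

/*
 * Returning 0 with *bpp == NULL is expected only when the caller
 * asked for a trylock; any other NULL is a real failure.
 */
static int trans_read_buf(int flags, struct buf **bpp)
{
	*bpp = read_buf(flags);
	if (*bpp == NULL)
		return (flags & TRYLOCK) ? 0 : ENOMEM;
	return 0;
}

int main(void)
{
	struct buf *bp;

	printf("trylock: %d\n", trans_read_buf(TRYLOCK, &bp));	/* 0 */
	printf("blocking: %d\n", trans_read_buf(0, &bp));	/* ENOMEM */
	return 0;
}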