From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb9bd7832b6..462e08e724b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7218,7 +7218,6 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readpages = btrfs_readpages, - .sync_page = block_sync_page, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, -- cgit v1.2.3-70-g09d2 From 57a45ced94fe48a701361d64230fc16eefa189dd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Jan 2011 16:30:38 -0500 Subject: Btrfs: change reserved_extents to an atomic_t We track delayed allocation per inodes via 2 counters, one is outstanding_extents and reserved_extents. Outstanding_extents is already an atomic_t, but reserved_extents is not and is protected by a spinlock. So convert this to an atomic_t and instead of using a spinlock, use atomic_cmpxchg when releasing delalloc bytes. This makes our inode 72 bytes smaller, and reduces locking overhead (albiet it was minimal to begin with). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/extent-tree.c | 42 ++++++++++++++++++++++++++---------------- fs/btrfs/inode.c | 5 ++--- 3 files changed, 29 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ccc991c542d..57c3bb2884c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -136,9 +136,8 @@ struct btrfs_inode { * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. */ - spinlock_t accounting_lock; atomic_t outstanding_extents; - int reserved_extents; + atomic_t reserved_extents; /* * ordered_data_close is set by truncate when a file that used diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7b3089b5c2d..27376c97d85 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3996,6 +3996,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; + int reserved_extents; int ret; if (btrfs_transaction_in_commit(root->fs_info)) @@ -4003,25 +4004,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); - spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; - if (nr_extents > BTRFS_I(inode)->reserved_extents) { - nr_extents -= BTRFS_I(inode)->reserved_extents; + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + + if (nr_extents > reserved_extents) { + nr_extents -= reserved_extents; to_reserve = calc_trans_metadata_size(root, nr_extents); } else { nr_extents = 0; to_reserve = 0; } - spin_unlock(&BTRFS_I(inode)->accounting_lock); + to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) return ret; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); atomic_inc(&BTRFS_I(inode)->outstanding_extents); - spin_unlock(&BTRFS_I(inode)->accounting_lock); block_rsv_add_bytes(block_rsv, to_reserve, 1); @@ -4036,20 +4036,30 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) struct btrfs_root *root = BTRFS_I(inode)->root; u64 to_free; int nr_extents; + int reserved_extents; num_bytes = ALIGN(num_bytes, root->sectorsize); atomic_dec(&BTRFS_I(inode)->outstanding_extents); WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); - spin_lock(&BTRFS_I(inode)->accounting_lock); - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); - if (nr_extents < BTRFS_I(inode)->reserved_extents) { - nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; - BTRFS_I(inode)->reserved_extents -= nr_extents; - } else { - nr_extents = 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + do { + int old, new; + + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents >= reserved_extents) { + nr_extents = 0; + break; + } + old = reserved_extents; + nr_extents = reserved_extents - nr_extents; + new = reserved_extents - nr_extents; + old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, + reserved_extents, new); + if (likely(old == reserved_extents)) + break; + reserved_extents = old; + } while (1); to_free = calc_csum_metadata_size(inode, num_bytes); if (nr_extents > 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9007bbd01db..d97b69afbbf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6632,9 +6632,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; - spin_lock_init(&ei->accounting_lock); atomic_set(&ei->outstanding_extents, 0); - ei->reserved_extents = 0; + atomic_set(&ei->reserved_extents, 0); ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; @@ -6670,7 +6669,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); - WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); /* * This can happen where we create an inode, but somebody else also -- cgit v1.2.3-70-g09d2 From dc89e9824464e91fa0b06267864ceabe3186fd8b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Jan 2011 17:05:48 -0500 Subject: Btrfs: use a slab for the free space entries Since we alloc/free free space entries a whole lot, lets use a slab to keep track of them. This makes some of my tests slightly faster. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/free-space-cache.c | 34 ++++++++++++++++++---------------- fs/btrfs/inode.c | 10 ++++++++++ 3 files changed, 29 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7f78cc78fdd..2c98d209e6a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -40,6 +40,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; +extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a0390657451..0282033041e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -393,7 +393,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, break; need_loop = 1; - e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); if (!e) { kunmap(page); unlock_page(page); @@ -405,7 +406,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, e->bytes = le64_to_cpu(entry->bytes); if (!e->bytes) { kunmap(page); - kfree(e); + kmem_cache_free(btrfs_free_space_cachep, e); unlock_page(page); page_cache_release(page); goto free_cache; @@ -420,7 +421,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { kunmap(page); - kfree(e); + kmem_cache_free( + btrfs_free_space_cachep, e); unlock_page(page); page_cache_release(page); goto free_cache; @@ -1187,7 +1189,7 @@ static void free_bitmap(struct btrfs_block_group_cache *block_group, { unlink_free_space(block_group, bitmap_info); kfree(bitmap_info->bitmap); - kfree(bitmap_info); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); block_group->total_bitmaps--; recalculate_thresholds(block_group); } @@ -1342,8 +1344,8 @@ new_bitmap: /* no pre-allocated info, allocate a new one */ if (!info) { - info = kzalloc(sizeof(struct btrfs_free_space), - GFP_NOFS); + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); if (!info) { spin_lock(&block_group->tree_lock); ret = -ENOMEM; @@ -1365,7 +1367,7 @@ out: if (info) { if (info->bitmap) kfree(info->bitmap); - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); } return ret; @@ -1398,7 +1400,7 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group, else __unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; - kfree(right_info); + kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; } @@ -1410,7 +1412,7 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group, __unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; - kfree(left_info); + kmem_cache_free(btrfs_free_space_cachep, left_info); merged = true; } @@ -1423,7 +1425,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *info; int ret = 0; - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; @@ -1450,7 +1452,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, link: ret = link_free_space(block_group, info); if (ret) - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); out: spin_unlock(&block_group->tree_lock); @@ -1520,7 +1522,7 @@ again: kfree(info->bitmap); block_group->total_bitmaps--; } - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); goto out_lock; } @@ -1556,7 +1558,7 @@ again: /* the hole we're creating ends at the end * of the info struct, just free the info */ - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); } spin_unlock(&block_group->tree_lock); @@ -1689,7 +1691,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) unlink_free_space(block_group, info); if (info->bitmap) kfree(info->bitmap); - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); if (need_resched()) { spin_unlock(&block_group->tree_lock); cond_resched(); @@ -1722,7 +1724,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, entry->offset += bytes; entry->bytes -= bytes; if (!entry->bytes) - kfree(entry); + kmem_cache_free(btrfs_free_space_cachep, entry); else link_free_space(block_group, entry); } @@ -1884,7 +1886,7 @@ out: block_group->free_space -= bytes; if (entry->bytes == 0) { block_group->free_extents--; - kfree(entry); + kmem_cache_free(btrfs_free_space_cachep, entry); } spin_unlock(&block_group->tree_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d97b69afbbf..2d2e079713d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "tree-log.h" #include "compression.h" #include "locking.h" +#include "free-space-cache.h" struct btrfs_iget_args { u64 ino; @@ -70,6 +71,7 @@ static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -6761,6 +6763,8 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_transaction_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); + if (btrfs_free_space_cachep) + kmem_cache_destroy(btrfs_free_space_cachep); } int btrfs_init_cachep(void) @@ -6789,6 +6793,12 @@ int btrfs_init_cachep(void) if (!btrfs_path_cachep) goto fail; + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + sizeof(struct btrfs_free_space), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); -- cgit v1.2.3-70-g09d2 From a41ad394a03b802497958d7c98a9dcf607266645 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 31 Jan 2011 15:30:16 -0500 Subject: Btrfs: convert to the new truncate sequence ->truncate() is going away, instead all of the work needs to be done in ->setattr(). So this converts us over to do this. It's fairly straightforward, just get rid of our .truncate inode operation and call btrfs_truncate() directly from btrfs_setsize. This works out better for us since truncate can technically return ENOSPC, and before we had no way of letting anybody know. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 5 ++-- fs/btrfs/inode.c | 80 +++++++++++++++++++++++++------------------------------- 3 files changed, 40 insertions(+), 47 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c98d209e6a..34142d5647d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2537,7 +2537,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_cont_expand(struct inode *inode, loff_t size); +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 24a19c2743c..3786eca2a90 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -817,7 +817,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; if (start_pos > inode->i_size) { - err = btrfs_cont_expand(inode, start_pos); + err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); if (err) return err; } @@ -1330,7 +1330,8 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); + ret = btrfs_cont_expand(inode, i_size_read(inode), + alloc_start); if (ret) goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2d2e079713d..3662ffec17d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,7 +84,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static void btrfs_truncate(struct inode *inode); +static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, @@ -2371,6 +2372,11 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); + iput(inode); + continue; + } nr_truncate++; btrfs_truncate(inode); } else { @@ -3538,7 +3544,7 @@ out: return ret; } -int btrfs_cont_expand(struct inode *inode, loff_t size) +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3546,7 +3552,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; - u64 hole_start = (inode->i_size + mask) & ~mask; + u64 hole_start = (oldsize + mask) & ~mask; u64 block_end = (size + mask) & ~mask; u64 last_byte; u64 cur_offset; @@ -3617,27 +3623,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) return err; } -static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +static int btrfs_setsize(struct inode *inode, loff_t newsize) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + loff_t oldsize = i_size_read(inode); unsigned long nr; int ret; - if (attr->ia_size == inode->i_size) + if (newsize == oldsize) return 0; - if (attr->ia_size > inode->i_size) { - unsigned long limit; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (attr->ia_size > inode->i_sb->s_maxbytes) - return -EFBIG; - if (limit != RLIM_INFINITY && attr->ia_size > limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - } - trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3651,16 +3647,16 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); - if (attr->ia_size > inode->i_size) { - ret = btrfs_cont_expand(inode, attr->ia_size); + if (newsize > oldsize) { + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + truncate_pagecache(inode, oldsize, newsize); + ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_truncate(inode); + btrfs_setsize(inode, oldsize); return ret; } - i_size_write(inode, attr->ia_size); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 0); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); @@ -3676,22 +3672,22 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); - return 0; - } + } else { - /* - * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. - */ - if (attr->ia_size == 0) - BTRFS_I(inode)->ordered_data_close = 1; + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (newsize == 0) + BTRFS_I(inode)->ordered_data_close = 1; - /* we don't support swapfiles, so vmtruncate shouldn't fail */ - ret = vmtruncate(inode, attr->ia_size); - BUG_ON(ret); + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_setsize(inode, newsize); + ret = btrfs_truncate(inode); + } - return 0; + return ret; } static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -3708,7 +3704,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setattr_size(inode, attr); + err = btrfs_setsize(inode, attr->ia_size); if (err) return err; } @@ -6478,7 +6474,7 @@ out: return ret; } -static void btrfs_truncate(struct inode *inode) +static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -6486,14 +6482,9 @@ static void btrfs_truncate(struct inode *inode) unsigned long nr; u64 mask = root->sectorsize - 1; - if (!S_ISREG(inode->i_mode)) { - WARN_ON(1); - return; - } - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) - return; + return ret; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); @@ -6568,6 +6559,8 @@ static void btrfs_truncate(struct inode *inode) ret = btrfs_end_transaction_throttle(trans, root); BUG_ON(ret); btrfs_btree_balance_dirty(root, nr); + + return ret; } /* @@ -7367,7 +7360,6 @@ static const struct address_space_operations btrfs_symlink_aops = { }; static const struct inode_operations btrfs_file_inode_operations = { - .truncate = btrfs_truncate, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, -- cgit v1.2.3-70-g09d2 From 3893e33b0bebee2f67d96b6c15259dc884523c20 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 31 Jan 2011 16:03:11 -0500 Subject: Btrfs: cleanup error handling in the truncate path Now that we can handle having errors in the truncate path lets make sure we return errors instead of doing BUG_ON() and such. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 64 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3662ffec17d..d83025063ee 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3597,13 +3597,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, &hint_byte, 1); - BUG_ON(err); + if (err) + break; err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); - BUG_ON(err); + if (err) + break; btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); @@ -3641,7 +3643,10 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) btrfs_set_trans_block_group(trans, inode); ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } nr = trans->blocks_used; btrfs_end_transaction(trans, root); @@ -3658,17 +3663,24 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) } trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; BUG_ON(!trans->block_rsv); + /* + * If this fails just leave the orphan item so that it can get + * cleaned up next time we mount. + */ ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - if (inode->i_nlink > 0) { - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; } + if (inode->i_nlink > 0) + ret = btrfs_orphan_del(trans, inode); nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); @@ -6478,6 +6490,7 @@ static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int err = 0; struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; @@ -6490,7 +6503,8 @@ static int btrfs_truncate(struct inode *inode) btrfs_ordered_update_i_size(inode, inode->i_size, NULL); trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; @@ -6517,29 +6531,38 @@ static int btrfs_truncate(struct inode *inode) while (1) { if (!trans) { trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; } ret = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, 0, 5); - if (ret) { - BUG_ON(ret != -EAGAIN); + if (ret == -EAGAIN) { ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (ret) + return ret; trans = NULL; continue; + } else if (ret) { + err = ret; + break; } ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -EAGAIN) + if (ret != -EAGAIN) { + err = ret; break; + } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (ret) { + err = ret; + break; + } nr = trans->blocks_used; btrfs_end_transaction(trans, root); @@ -6549,18 +6572,21 @@ static int btrfs_truncate(struct inode *inode) if (ret == 0 && inode->i_nlink > 0) { ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + if (ret) + err = ret; } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (ret && !err) + err = ret; nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); - BUG_ON(ret); + if (ret && !err) + err = ret; btrfs_btree_balance_dirty(root, nr); - return ret; + return err; } /* -- cgit v1.2.3-70-g09d2 From 66b4ffd110f9b48b8d8c1319ee446b53b8d073bf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 31 Jan 2011 16:22:42 -0500 Subject: Btrfs: handle errors in btrfs_orphan_cleanup If we cannot truncate an inode for some reason we will never delete the orphan item associated with that inode, which means that we will loop forever in btrfs_orphan_cleanup. Instead of doing this just return error so we fail to mount. It sucks, but hey it's better than hanging. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 15 ++++++++++++--- fs/btrfs/extent-tree.c | 3 ++- fs/btrfs/inode.c | 47 +++++++++++++++++++++++++++++++---------------- fs/btrfs/ioctl.c | 4 +++- fs/btrfs/relocation.c | 2 +- 6 files changed, 50 insertions(+), 23 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 34142d5647d..841330f3d68 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2529,7 +2529,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); -void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1aa8d607bc..495b1ac45f8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2058,9 +2058,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); - btrfs_orphan_cleanup(fs_info->fs_root); - btrfs_orphan_cleanup(fs_info->tree_root); + err = btrfs_orphan_cleanup(fs_info->fs_root); + if (!err) + err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + if (err) { + close_ctree(tree_root); + return ERR_PTR(err); + } } return tree_root; @@ -2435,8 +2440,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { + int err; + root_objectid = gang[i]->root_key.objectid; - btrfs_orphan_cleanup(gang[i]); + err = btrfs_orphan_cleanup(gang[i]); + if (err) + return err; } root_objectid++; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 27376c97d85..a8f4e8d2ba6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7619,7 +7619,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root) reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); BUG_ON(!reloc_root); - btrfs_orphan_cleanup(reloc_root); + ret = btrfs_orphan_cleanup(reloc_root); + BUG_ON(ret); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d83025063ee..0600265cb9b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2284,7 +2284,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) * this cleans up any orphans that may be left on the list from the last use * of this root. */ -void btrfs_orphan_cleanup(struct btrfs_root *root) +int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -2294,10 +2294,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) - return; + return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; @@ -2306,11 +2309,8 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - printk(KERN_ERR "Error searching slot for orphan: %d" - "\n", ret); - break; - } + if (ret < 0) + goto out; /* * if ret == 0 means we found what we were searching for, which @@ -2318,6 +2318,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * find the key and see if we have stuff that matches */ if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -2345,7 +2346,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - BUG_ON(IS_ERR(inode)); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } /* * add this inode to the orphan list so btrfs_orphan_del does @@ -2363,7 +2367,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) */ if (is_bad_inode(inode)) { trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2378,16 +2385,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; - btrfs_truncate(inode); + ret = btrfs_truncate(inode); } else { nr_unlink++; } /* this will do delete_inode and everything for us */ iput(inode); + if (ret) + goto out; } - btrfs_free_path(path); - root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) @@ -2396,14 +2403,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) if (root->orphan_block_rsv || root->orphan_item_inserted) { trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - btrfs_end_transaction(trans, root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans, root); } if (nr_unlink) printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); if (nr_truncate) printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + +out: + if (ret) + printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); + btrfs_free_path(path); + return ret; } /* @@ -4156,8 +4169,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (!IS_ERR(inode) && root != sub_root) { down_read(&root->fs_info->cleanup_work_sem); if (!(inode->i_sb->s_flags & MS_RDONLY)) - btrfs_orphan_cleanup(sub_root); + ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); + if (ret) + inode = ERR_PTR(ret); } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5fdb2abc4fa..ad9b8c0e930 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -409,7 +409,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (ret) goto fail; - btrfs_orphan_cleanup(pending_snapshot->snap); + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; parent = dget_parent(dentry); inode = btrfs_lookup_dentry(parent->d_inode, dentry); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 31ade5802ae..c863c844701 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4209,7 +4209,7 @@ out: if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); else - btrfs_orphan_cleanup(fs_root); + err = btrfs_orphan_cleanup(fs_root); } return err; } -- cgit v1.2.3-70-g09d2 From ded5db9de78f963979e1605f859de67626f54693 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Mar 2011 14:09:46 -0500 Subject: Btrfs: make sure to remove the orphan item from the in-memory list This fixes a problem where if truncate fails the inode will still be on the in memory orphan list. This is will make us complain when the inode gets destroyed because it's still on the orphan list. So if we fail just remove us from the in memory list and carry on. Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0600265cb9b..3bd0ff63bf3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6589,6 +6589,12 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; + } else if (ret && inode->i_nlink > 0) { + /* + * Failed to do the truncate, remove us from the in memory + * orphan list. + */ + ret = btrfs_orphan_del(NULL, inode); } ret = btrfs_update_inode(trans, root, inode); -- cgit v1.2.3-70-g09d2 From f0cd846e9221811d87047f1428cf5226e7236efe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Mar 2011 14:37:08 -0500 Subject: Btrfs: only add orphan items when truncating We don't need an orphan item when expanding files, we just need them for truncating them, so only add the orphan item in btrfs_truncate instead of in btrfs_setsize. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3bd0ff63bf3..206b60362ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3649,22 +3649,6 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) if (newsize == oldsize) return 0; - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - btrfs_set_trans_block_group(trans, inode); - - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - return ret; - } - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - if (newsize > oldsize) { i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); @@ -3675,25 +3659,15 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) return ret; } - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; - BUG_ON(!trans->block_rsv); - - /* - * If this fails just leave the orphan item so that it can get - * cleaned up next time we mount. - */ ret = btrfs_update_inode(trans, root, inode); if (ret) { btrfs_end_transaction(trans, root); return ret; } - if (inode->i_nlink > 0) - ret = btrfs_orphan_del(trans, inode); nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); @@ -6517,6 +6491,23 @@ static int btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + /* Now start a transaction for the truncate */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); -- cgit v1.2.3-70-g09d2 From 930f028abe39dfd0849b53131d19c4b67aacbe67 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Mar 2011 14:41:41 -0500 Subject: Btrfs: use mark_inode_dirty when expanding the file Mark_inode_dirty will call btrfs_dirty_inode which will take care of updating the inode. This makes setsize a little cleaner since we don't have to start a transaction and update the inode in there, we can just call mark_inode_dirty. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 206b60362ce..64d57e032b4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3640,10 +3640,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) static int btrfs_setsize(struct inode *inode, loff_t newsize) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); - unsigned long nr; int ret; if (newsize == oldsize) @@ -3659,18 +3656,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) return ret; } - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_end_transaction(trans, root); - return ret; - } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); + mark_inode_dirty(inode); } else { /* -- cgit v1.2.3-70-g09d2 From 695a0d0da09e75c4475bbb303def159023ef72ca Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Mar 2011 15:46:53 -0500 Subject: Btrfs: add a comment explaining what btrfs_cont_expand does Everytime I have to deal with btrfs_cont_expand I stare at it for 20 minutes trying to remember what exactly it does and why the hell we need it. So add a comment to save future-Josef some time. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 64d57e032b4..888dbdb3b12 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3557,6 +3557,12 @@ out: return ret; } +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for + * the range between oldsize and size + */ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { struct btrfs_trans_handle *trans; -- cgit v1.2.3-70-g09d2 From 22a94d44bd6876a90630338229da6c0436d46593 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Mar 2011 16:47:17 -0400 Subject: Btrfs: add checks to verify dir items are correct We need to make sure the dir items we get are valid dir items. So any time we try and read one check it with verify_dir_item, which will do various sanity checks to make sure it looks sane. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/dir-item.c | 35 +++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 3 +++ fs/btrfs/tree-log.c | 7 +++++++ fs/btrfs/xattr.c | 2 ++ 5 files changed, 50 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 841330f3d68..6036fdb88c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2393,6 +2393,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f0cad5ae5be..02c97ad61b6 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -377,6 +377,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + if (verify_dir_item(root, leaf, dir_item)) + return NULL; + total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + @@ -429,3 +432,35 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, } return ret; } + +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item) +{ + u16 namelen = BTRFS_NAME_LEN; + u8 type = btrfs_dir_type(leaf, dir_item); + + if (type >= BTRFS_FT_MAX) { + printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + (int)type); + return 1; + } + + if (type == BTRFS_FT_XATTR) + namelen = XATTR_NAME_MAX; + + if (btrfs_dir_name_len(leaf, dir_item) > namelen) { + printk(KERN_CRIT "btrfS: invalid dir item name len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ + if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { + printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + return 0; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 888dbdb3b12..e010000d4bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4272,6 +4272,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; + if (verify_dir_item(root, leaf, di)) + break; + name_len = btrfs_dir_name_len(leaf, di); if (name_len <= sizeof(tmp_name)) { name_ptr = tmp_name; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a4bbb854dfd..429cfcfadf9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1286,6 +1286,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) + return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); BUG_ON(ret); @@ -1412,6 +1414,11 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) { + ret = -EIO; + goto out; + } + name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); if (!name) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a5776531dc2..e5d22f28095 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -242,6 +242,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) break; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) + continue; name_len = btrfs_dir_name_len(leaf, di); total_size += name_len + 1; -- cgit v1.2.3-70-g09d2 From 98bc3149fad639c8f50c7110b961a2a2fe085eed Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Mar 2011 11:00:46 -0400 Subject: Btrfs: don't allocate dip->csums when doing writes When doing direct writes we store the checksums in the ordered sum stuff in the ordered extent for writing them when the write completes, so we don't even use the dip->csums array. So if we're writing, don't bother allocating dip->csums since we won't use it anyway. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e010000d4bc..570cd44fe91 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5944,6 +5944,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int nr_pages = 0; u32 *csums = dip->csums; int ret = 0; + int write = rw & REQ_WRITE; bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) @@ -5980,7 +5981,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, goto out_err; } - if (!skip_sum) + /* Write's use the ordered csums */ + if (!write && !skip_sum) csums = csums + nr_pages; start_sector += submit_len >> 9; file_offset += submit_len; @@ -6048,7 +6050,8 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, } dip->csums = NULL; - if (!skip_sum) { + /* Write's use the ordered csum stuff, so we don't need dip->csums */ + if (!write && !skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { kfree(dip); -- cgit v1.2.3-70-g09d2 From c0da7aa1a2d8fcafe271a7077599253c8ed94bb2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Mar 2011 11:05:07 -0400 Subject: Btrfs: mark the bio with an error if we have a failure in dio I noticed that dio_end_io calls the appropriate endio function with an error, but the endio functions don't actually do anything with that error, they assume that if there was an error then the bio will not be uptodate. So if we had checksum failures we would never pass back EIO. So if there is an error in our endio functions make sure to clear the uptodate flag on the bio. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 570cd44fe91..e9813bd7d55 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5744,6 +5744,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip->csums); kfree(dip); + + /* If we had a csum failure make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); dio_end_io(bio, err); } @@ -5845,6 +5849,10 @@ out_done: kfree(dip->csums); kfree(dip); + + /* If we had an error make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); dio_end_io(bio, err); } -- cgit v1.2.3-70-g09d2 From 1abe9b8a138c9988ba8f7bfded6453649a31541f Mon Sep 17 00:00:00 2001 From: liubo Date: Thu, 24 Mar 2011 11:18:59 +0000 Subject: Btrfs: add initial tracepoint support for btrfs Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0 dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0 btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0) btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0) btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8 flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0) flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0) flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0) Here is what I have added: 1) ordere_extent: btrfs_ordered_extent_add btrfs_ordered_extent_remove btrfs_ordered_extent_start btrfs_ordered_extent_put These provide critical information to understand how ordered_extents are updated. 2) extent_map: btrfs_get_extent extent_map is used in both read and write cases, and it is useful for tracking how btrfs specific IO is running. 3) writepage: __extent_writepage btrfs_writepage_end_io_hook Pages are cirtical resourses and produce a lot of corner cases during writeback, so it is valuable to know how page is written to disk. 4) inode: btrfs_inode_new btrfs_inode_request btrfs_inode_evict These can show where and when a inode is created, when a inode is evicted. 5) sync: btrfs_sync_file btrfs_sync_fs These show sync arguments. 6) transaction: btrfs_transaction_commit In transaction based filesystem, it will be useful to know the generation and who does commit. 7) back reference and cow: btrfs_delayed_tree_ref btrfs_delayed_data_ref btrfs_delayed_ref_head btrfs_cow_block Btrfs natively supports back references, these tracepoints are helpful on understanding btrfs's COW mechanism. 8) chunk: btrfs_chunk_alloc btrfs_chunk_free Chunk is a link between physical offset and logical offset, and stands for space infomation in btrfs, and these are helpful on tracing space things. 9) reserved_extent: btrfs_reserved_extent_alloc btrfs_reserved_extent_free These can show how btrfs uses its space. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 3 + fs/btrfs/ctree.h | 1 + fs/btrfs/delayed-ref.c | 6 + fs/btrfs/extent-tree.c | 4 + fs/btrfs/extent_io.c | 2 + fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 12 + fs/btrfs/ordered-data.c | 8 + fs/btrfs/super.c | 5 + fs/btrfs/transaction.c | 2 + fs/btrfs/volumes.c | 16 +- fs/btrfs/volumes.h | 11 + include/trace/events/btrfs.h | 667 +++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 727 insertions(+), 11 deletions(-) create mode 100644 include/trace/events/btrfs.h (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8680110f0a5..465b5d7d6b4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -535,6 +535,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); + + trace_btrfs_cow_block(root, buf, *cow_ret); + return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0ee679b6c1b..9d0f59142af 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "extent_io.h" #include "extent_map.h" diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e807b143b85..bce28f65389 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -483,6 +483,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&head_ref->cluster); mutex_init(&head_ref->mutex); + trace_btrfs_delayed_ref_head(ref, head_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { @@ -537,6 +539,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, } full_ref->level = level; + trace_btrfs_delayed_tree_ref(ref, full_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { @@ -591,6 +595,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, full_ref->objectid = owner; full_ref->offset = offset; + trace_btrfs_delayed_data_ref(ref, full_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd794c35a63..86ea471d380 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5412,6 +5412,8 @@ again: dump_space_info(sinfo, num_bytes, 1); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); + return ret; } @@ -5433,6 +5435,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) update_reserved_bytes(cache, len, 0, 1); btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(root, start, len); + return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1bbd26b4fc5..77c65a0bea3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2192,6 +2192,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, else write_flags = WRITE; + trace___extent_writepage(page, inode, wbc); + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a85b044cf39..656bc0a892b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1205,6 +1205,7 @@ int btrfs_sync_file(struct file *file, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + trace_btrfs_sync_file(file, datasync); /* we wait first, since the writeback may change the inode */ root->log_batch++; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e9813bd7d55..eaa27148419 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1787,6 +1787,8 @@ out: static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); + ClearPagePrivate2(page); return btrfs_finish_ordered_io(page->mapping->host, start, end); } @@ -3718,6 +3720,8 @@ void btrfs_evict_inode(struct inode *inode) unsigned long nr; int ret; + trace_btrfs_inode_evict(inode); + truncate_inode_pages(&inode->i_data, 0); if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || root == root->fs_info->tree_root)) @@ -4510,6 +4514,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); if (dir) { + trace_btrfs_inode_request(dir); + ret = btrfs_set_inode_index(dir, index); if (ret) { iput(inode); @@ -4584,6 +4590,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, insert_inode_hash(inode); inode_tree_add(inode); + + trace_btrfs_inode_new(inode); + return inode; fail: if (dir) @@ -5261,6 +5270,9 @@ insert: } write_unlock(&em_tree->lock); out: + + trace_btrfs_get_extent(root, em); + if (path) btrfs_free_path(path); if (trans) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 083a5547737..a1c94042530 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -202,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); + trace_btrfs_ordered_extent_add(inode, entry); + spin_lock(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); @@ -387,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; + trace_btrfs_ordered_extent_put(entry->inode, entry); + if (atomic_dec_and_test(&entry->refs)) { while (!list_empty(&entry->list)) { cur = entry->list.next; @@ -420,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); + trace_btrfs_ordered_extent_remove(inode, entry); + /* * we have no more ordered extents for this inode and * no dirty pages. We can safely remove it from the @@ -585,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode, u64 start = entry->file_offset; u64 end = start + entry->len - 1; + trace_btrfs_ordered_extent_start(inode, entry); + /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d39a9895d93..2edfc039f09 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,6 +52,9 @@ #include "export.h" #include "compression.h" +#define CREATE_TRACE_POINTS +#include + static const struct super_operations btrfs_super_ops; static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, @@ -620,6 +623,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_root *root = btrfs_sb(sb); int ret; + trace_btrfs_sync_fs(wait); + if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3d73c8d93bb..5b4bc685bb0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1389,6 +1389,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + trace_btrfs_transaction_commit(root); + mutex_unlock(&root->fs_info->trans_mutex); if (current->journal_info == trans) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd13eb81ee4..8ba3c9ebff9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -33,17 +33,6 @@ #include "volumes.h" #include "async-thread.h" -struct map_lookup { - u64 type; - int io_align; - int io_width; - int stripe_len; - int sector_size; - int num_stripes; - int sub_stripes; - struct btrfs_bio_stripe stripes[]; -}; - static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -1923,6 +1912,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, BUG_ON(ret); + trace_btrfs_chunk_free(root, map, chunk_offset, em->len); + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); BUG_ON(ret); @@ -2650,6 +2641,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, *num_bytes = chunk_bytes_by_type(type, calc_size, map->num_stripes, sub_stripes); + trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes); + em = alloc_extent_map(GFP_NOFS); if (!em) { ret = -ENOMEM; @@ -2758,6 +2751,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, item_size); BUG_ON(ret); } + kfree(chunk); return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7fb59d45fe8..7b38d0668b5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -145,6 +145,17 @@ struct btrfs_device_info { u64 max_avail; }; +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + /* Used to sort the devices by max_avail(descending sort) */ int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h new file mode 100644 index 00000000000..f445cff66ab --- /dev/null +++ b/include/trace/events/btrfs.h @@ -0,0 +1,667 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM btrfs + +#if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BTRFS_H + +#include +#include + +struct btrfs_root; +struct btrfs_fs_info; +struct btrfs_inode; +struct extent_map; +struct btrfs_ordered_extent; +struct btrfs_delayed_ref_node; +struct btrfs_delayed_tree_ref; +struct btrfs_delayed_data_ref; +struct btrfs_delayed_ref_head; +struct map_lookup; +struct extent_buffer; + +#define show_ref_type(type) \ + __print_symbolic(type, \ + { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ + { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ + { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \ + { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ + { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) + +#define __show_root_type(obj) \ + __print_symbolic(obj, \ + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, \ + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, \ + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, \ + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, \ + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, \ + { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" }, \ + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, \ + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, \ + { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ + { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) + +#define show_root_type(obj) \ + obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ + (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" + +TRACE_EVENT(btrfs_transaction_commit, + + TP_PROTO(struct btrfs_root *root), + + TP_ARGS(root), + + TP_STRUCT__entry( + __field( u64, generation ) + __field( u64, root_objectid ) + ), + + TP_fast_assign( + __entry->generation = root->fs_info->generation; + __entry->root_objectid = root->root_key.objectid; + ), + + TP_printk("root = %llu(%s), gen = %llu", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->generation) +); + +DECLARE_EVENT_CLASS(btrfs__inode, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( blkcnt_t, blocks ) + __field( u64, disk_i_size ) + __field( u64, generation ) + __field( u64, last_trans ) + __field( u64, logged_trans ) + __field( u64, root_objectid ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->blocks = inode->i_blocks; + __entry->disk_i_size = BTRFS_I(inode)->disk_i_size; + __entry->generation = BTRFS_I(inode)->generation; + __entry->last_trans = BTRFS_I(inode)->last_trans; + __entry->logged_trans = BTRFS_I(inode)->logged_trans; + __entry->root_objectid = + BTRFS_I(inode)->root->root_key.objectid; + ), + + TP_printk("root = %llu(%s), gen = %llu, ino = %lu, blocks = %llu, " + "disk_i_size = %llu, last_trans = %llu, logged_trans = %llu", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->generation, + (unsigned long)__entry->ino, + (unsigned long long)__entry->blocks, + (unsigned long long)__entry->disk_i_size, + (unsigned long long)__entry->last_trans, + (unsigned long long)__entry->logged_trans) +); + +DEFINE_EVENT(btrfs__inode, btrfs_inode_new, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(btrfs__inode, btrfs_inode_request, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +#define __show_map_type(type) \ + __print_symbolic(type, \ + { EXTENT_MAP_LAST_BYTE, "LAST_BYTE" }, \ + { EXTENT_MAP_HOLE, "HOLE" }, \ + { EXTENT_MAP_INLINE, "INLINE" }, \ + { EXTENT_MAP_DELALLOC, "DELALLOC" }) + +#define show_map_type(type) \ + type, (type >= EXTENT_MAP_LAST_BYTE) ? "-" : __show_map_type(type) + +#define show_map_flags(flag) \ + __print_flags(flag, "|", \ + { EXTENT_FLAG_PINNED, "PINNED" }, \ + { EXTENT_FLAG_COMPRESSED, "COMPRESSED" }, \ + { EXTENT_FLAG_VACANCY, "VACANCY" }, \ + { EXTENT_FLAG_PREALLOC, "PREALLOC" }) + +TRACE_EVENT(btrfs_get_extent, + + TP_PROTO(struct btrfs_root *root, struct extent_map *map), + + TP_ARGS(root, map), + + TP_STRUCT__entry( + __field( u64, root_objectid ) + __field( u64, start ) + __field( u64, len ) + __field( u64, orig_start ) + __field( u64, block_start ) + __field( u64, block_len ) + __field( unsigned long, flags ) + __field( int, refs ) + __field( unsigned int, compress_type ) + ), + + TP_fast_assign( + __entry->root_objectid = root->root_key.objectid; + __entry->start = map->start; + __entry->len = map->len; + __entry->orig_start = map->orig_start; + __entry->block_start = map->block_start; + __entry->block_len = map->block_len; + __entry->flags = map->flags; + __entry->refs = atomic_read(&map->refs); + __entry->compress_type = map->compress_type; + ), + + TP_printk("root = %llu(%s), start = %llu, len = %llu, " + "orig_start = %llu, block_start = %llu(%s), " + "block_len = %llu, flags = %s, refs = %u, " + "compress_type = %u", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->start, + (unsigned long long)__entry->len, + (unsigned long long)__entry->orig_start, + show_map_type(__entry->block_start), + (unsigned long long)__entry->block_len, + show_map_flags(__entry->flags), + __entry->refs, __entry->compress_type) +); + +#define show_ordered_flags(flags) \ + __print_symbolic(flags, \ + { BTRFS_ORDERED_IO_DONE, "IO_DONE" }, \ + { BTRFS_ORDERED_COMPLETE, "COMPLETE" }, \ + { BTRFS_ORDERED_NOCOW, "NOCOW" }, \ + { BTRFS_ORDERED_COMPRESSED, "COMPRESSED" }, \ + { BTRFS_ORDERED_PREALLOC, "PREALLOC" }, \ + { BTRFS_ORDERED_DIRECT, "DIRECT" }) + +DECLARE_EVENT_CLASS(btrfs__ordered_extent, + + TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( u64, file_offset ) + __field( u64, start ) + __field( u64, len ) + __field( u64, disk_len ) + __field( u64, bytes_left ) + __field( unsigned long, flags ) + __field( int, compress_type ) + __field( int, refs ) + __field( u64, root_objectid ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->file_offset = ordered->file_offset; + __entry->start = ordered->start; + __entry->len = ordered->len; + __entry->disk_len = ordered->disk_len; + __entry->bytes_left = ordered->bytes_left; + __entry->flags = ordered->flags; + __entry->compress_type = ordered->compress_type; + __entry->refs = atomic_read(&ordered->refs); + __entry->root_objectid = + BTRFS_I(inode)->root->root_key.objectid; + ), + + TP_printk("root = %llu(%s), ino = %llu, file_offset = %llu, " + "start = %llu, len = %llu, disk_len = %llu, " + "bytes_left = %llu, flags = %s, compress_type = %d, " + "refs = %d", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->ino, + (unsigned long long)__entry->file_offset, + (unsigned long long)__entry->start, + (unsigned long long)__entry->len, + (unsigned long long)__entry->disk_len, + (unsigned long long)__entry->bytes_left, + show_ordered_flags(__entry->flags), + __entry->compress_type, __entry->refs) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, + + TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, + + TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, + + TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, + + TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DECLARE_EVENT_CLASS(btrfs__writepage, + + TP_PROTO(struct page *page, struct inode *inode, + struct writeback_control *wbc), + + TP_ARGS(page, inode, wbc), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( pgoff_t, index ) + __field( long, nr_to_write ) + __field( long, pages_skipped ) + __field( loff_t, range_start ) + __field( loff_t, range_end ) + __field( char, nonblocking ) + __field( char, for_kupdate ) + __field( char, for_reclaim ) + __field( char, range_cyclic ) + __field( pgoff_t, writeback_index ) + __field( u64, root_objectid ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->index = page->index; + __entry->nr_to_write = wbc->nr_to_write; + __entry->pages_skipped = wbc->pages_skipped; + __entry->range_start = wbc->range_start; + __entry->range_end = wbc->range_end; + __entry->nonblocking = wbc->nonblocking; + __entry->for_kupdate = wbc->for_kupdate; + __entry->for_reclaim = wbc->for_reclaim; + __entry->range_cyclic = wbc->range_cyclic; + __entry->writeback_index = inode->i_mapping->writeback_index; + __entry->root_objectid = + BTRFS_I(inode)->root->root_key.objectid; + ), + + TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " + "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " + "range_end = %llu, nonblocking = %d, for_kupdate = %d, " + "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", + show_root_type(__entry->root_objectid), + (unsigned long)__entry->ino, __entry->index, + __entry->nr_to_write, __entry->pages_skipped, + __entry->range_start, __entry->range_end, + __entry->nonblocking, __entry->for_kupdate, + __entry->for_reclaim, __entry->range_cyclic, + (unsigned long)__entry->writeback_index) +); + +DEFINE_EVENT(btrfs__writepage, __extent_writepage, + + TP_PROTO(struct page *page, struct inode *inode, + struct writeback_control *wbc), + + TP_ARGS(page, inode, wbc) +); + +TRACE_EVENT(btrfs_writepage_end_io_hook, + + TP_PROTO(struct page *page, u64 start, u64 end, int uptodate), + + TP_ARGS(page, start, end, uptodate), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( pgoff_t, index ) + __field( u64, start ) + __field( u64, end ) + __field( int, uptodate ) + __field( u64, root_objectid ) + ), + + TP_fast_assign( + __entry->ino = page->mapping->host->i_ino; + __entry->index = page->index; + __entry->start = start; + __entry->end = end; + __entry->uptodate = uptodate; + __entry->root_objectid = + BTRFS_I(page->mapping->host)->root->root_key.objectid; + ), + + TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, start = %llu, " + "end = %llu, uptodate = %d", + show_root_type(__entry->root_objectid), + (unsigned long)__entry->ino, (unsigned long)__entry->index, + (unsigned long long)__entry->start, + (unsigned long long)__entry->end, __entry->uptodate) +); + +TRACE_EVENT(btrfs_sync_file, + + TP_PROTO(struct file *file, int datasync), + + TP_ARGS(file, datasync), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + __field( u64, root_objectid ) + ), + + TP_fast_assign( + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + __entry->ino = inode->i_ino; + __entry->parent = dentry->d_parent->d_inode->i_ino; + __entry->datasync = datasync; + __entry->root_objectid = + BTRFS_I(inode)->root->root_key.objectid; + ), + + TP_printk("root = %llu(%s), ino = %ld, parent = %ld, datasync = %d", + show_root_type(__entry->root_objectid), + (unsigned long)__entry->ino, (unsigned long)__entry->parent, + __entry->datasync) +); + +TRACE_EVENT(btrfs_sync_fs, + + TP_PROTO(int wait), + + TP_ARGS(wait), + + TP_STRUCT__entry( + __field( int, wait ) + ), + + TP_fast_assign( + __entry->wait = wait; + ), + + TP_printk("wait = %d", __entry->wait) +); + +#define show_ref_action(action) \ + __print_symbolic(action, \ + { BTRFS_ADD_DELAYED_REF, "ADD_DELAYED_REF" }, \ + { BTRFS_DROP_DELAYED_REF, "DROP_DELAYED_REF" }, \ + { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" }, \ + { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" }) + + +TRACE_EVENT(btrfs_delayed_tree_ref, + + TP_PROTO(struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_tree_ref *full_ref, + int action), + + TP_ARGS(ref, full_ref, action), + + TP_STRUCT__entry( + __field( u64, bytenr ) + __field( u64, num_bytes ) + __field( int, action ) + __field( u64, parent ) + __field( u64, ref_root ) + __field( int, level ) + __field( int, type ) + ), + + TP_fast_assign( + __entry->bytenr = ref->bytenr; + __entry->num_bytes = ref->num_bytes; + __entry->action = action; + __entry->parent = full_ref->parent; + __entry->ref_root = full_ref->root; + __entry->level = full_ref->level; + __entry->type = ref->type; + ), + + TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " + "parent = %llu(%s), ref_root = %llu(%s), level = %d, " + "type = %s", + (unsigned long long)__entry->bytenr, + (unsigned long long)__entry->num_bytes, + show_ref_action(__entry->action), + show_root_type(__entry->parent), + show_root_type(__entry->ref_root), + __entry->level, show_ref_type(__entry->type)) +); + +TRACE_EVENT(btrfs_delayed_data_ref, + + TP_PROTO(struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_data_ref *full_ref, + int action), + + TP_ARGS(ref, full_ref, action), + + TP_STRUCT__entry( + __field( u64, bytenr ) + __field( u64, num_bytes ) + __field( int, action ) + __field( u64, parent ) + __field( u64, ref_root ) + __field( u64, owner ) + __field( u64, offset ) + __field( int, type ) + ), + + TP_fast_assign( + __entry->bytenr = ref->bytenr; + __entry->num_bytes = ref->num_bytes; + __entry->action = action; + __entry->parent = full_ref->parent; + __entry->ref_root = full_ref->root; + __entry->owner = full_ref->objectid; + __entry->offset = full_ref->offset; + __entry->type = ref->type; + ), + + TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, " + "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, " + "offset = %llu, type = %s", + (unsigned long long)__entry->bytenr, + (unsigned long long)__entry->num_bytes, + show_ref_action(__entry->action), + show_root_type(__entry->parent), + show_root_type(__entry->ref_root), + (unsigned long long)__entry->owner, + (unsigned long long)__entry->offset, + show_ref_type(__entry->type)) +); + +TRACE_EVENT(btrfs_delayed_ref_head, + + TP_PROTO(struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head *head_ref, + int action), + + TP_ARGS(ref, head_ref, action), + + TP_STRUCT__entry( + __field( u64, bytenr ) + __field( u64, num_bytes ) + __field( int, action ) + __field( int, is_data ) + ), + + TP_fast_assign( + __entry->bytenr = ref->bytenr; + __entry->num_bytes = ref->num_bytes; + __entry->action = action; + __entry->is_data = head_ref->is_data; + ), + + TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, is_data = %d", + (unsigned long long)__entry->bytenr, + (unsigned long long)__entry->num_bytes, + show_ref_action(__entry->action), + __entry->is_data) +); + +#define show_chunk_type(type) \ + __print_flags(type, "|", \ + { BTRFS_BLOCK_GROUP_DATA, "DATA" }, \ + { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ + { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ + { BTRFS_BLOCK_GROUP_RAID0, "RAID0" }, \ + { BTRFS_BLOCK_GROUP_RAID1, "RAID1" }, \ + { BTRFS_BLOCK_GROUP_DUP, "DUP" }, \ + { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}) + +DECLARE_EVENT_CLASS(btrfs__chunk, + + TP_PROTO(struct btrfs_root *root, struct map_lookup *map, + u64 offset, u64 size), + + TP_ARGS(root, map, offset, size), + + TP_STRUCT__entry( + __field( int, num_stripes ) + __field( u64, type ) + __field( int, sub_stripes ) + __field( u64, offset ) + __field( u64, size ) + __field( u64, root_objectid ) + ), + + TP_fast_assign( + __entry->num_stripes = map->num_stripes; + __entry->type = map->type; + __entry->sub_stripes = map->sub_stripes; + __entry->offset = offset; + __entry->size = size; + __entry->root_objectid = root->root_key.objectid; + ), + + TP_printk("root = %llu(%s), offset = %llu, size = %llu, " + "num_stripes = %d, sub_stripes = %d, type = %s", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->offset, + (unsigned long long)__entry->size, + __entry->num_stripes, __entry->sub_stripes, + show_chunk_type(__entry->type)) +); + +DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, + + TP_PROTO(struct btrfs_root *root, struct map_lookup *map, + u64 offset, u64 size), + + TP_ARGS(root, map, offset, size) +); + +DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, + + TP_PROTO(struct btrfs_root *root, struct map_lookup *map, + u64 offset, u64 size), + + TP_ARGS(root, map, offset, size) +); + +TRACE_EVENT(btrfs_cow_block, + + TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow), + + TP_ARGS(root, buf, cow), + + TP_STRUCT__entry( + __field( u64, root_objectid ) + __field( u64, buf_start ) + __field( int, refs ) + __field( u64, cow_start ) + __field( int, buf_level ) + __field( int, cow_level ) + ), + + TP_fast_assign( + __entry->root_objectid = root->root_key.objectid; + __entry->buf_start = buf->start; + __entry->refs = atomic_read(&buf->refs); + __entry->cow_start = cow->start; + __entry->buf_level = btrfs_header_level(buf); + __entry->cow_level = btrfs_header_level(cow); + ), + + TP_printk("root = %llu(%s), refs = %d, orig_buf = %llu " + "(orig_level = %d), cow_buf = %llu (cow_level = %d)", + show_root_type(__entry->root_objectid), + __entry->refs, + (unsigned long long)__entry->buf_start, + __entry->buf_level, + (unsigned long long)__entry->cow_start, + __entry->cow_level) +); + +DECLARE_EVENT_CLASS(btrfs__reserved_extent, + + TP_PROTO(struct btrfs_root *root, u64 start, u64 len), + + TP_ARGS(root, start, len), + + TP_STRUCT__entry( + __field( u64, root_objectid ) + __field( u64, start ) + __field( u64, len ) + ), + + TP_fast_assign( + __entry->root_objectid = root->root_key.objectid; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("root = %llu(%s), start = %llu, len = %llu", + show_root_type(__entry->root_objectid), + (unsigned long long)__entry->start, + (unsigned long long)__entry->len) +); + +DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc, + + TP_PROTO(struct btrfs_root *root, u64 start, u64 len), + + TP_ARGS(root, start, len) +); + +DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, + + TP_PROTO(struct btrfs_root *root, u64 start, u64 len), + + TP_ARGS(root, start, len) +); + +#endif /* _TRACE_BTRFS_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3-70-g09d2 From 75e7cb7fe0c391561bd3af36515be3f3c64a04c6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 22 Mar 2011 10:12:20 +0000 Subject: Btrfs: Per file/directory controls for COW and compression Data compression and data cow are controlled across the entire FS by mount options right now. ioctls are needed to set this on a per file or per directory basis. This has been proposed previously, but VFS developers wanted us to use generic ioctls rather than btrfs-specific ones. According to Chris's comment, there should be just one true compression method(probably LZO) stored in the super. However, before this, we would wait for that one method is stable enough to be adopted into the super. So I list it as a long term goal, and just store it in ram today. After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to control file and directory's datacow and compression attribute. NOTE: - The compression type is selected by such rules: If we mount btrfs with compress options, ie, zlib/lzo, the type is it. Otherwise, we'll use the default compress type (zlib today). v1->v2: - rebase to the latest btrfs. v2->v3: - fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW will be screwed by inheritance from parent directory. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/inode.c | 31 ++++++++++++++++++++++++++++--- fs/btrfs/ioctl.c | 41 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 72 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9d0f59142af..8302ecd4197 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1282,6 +1282,7 @@ struct btrfs_root { #define BTRFS_INODE_NODUMP (1 << 8) #define BTRFS_INODE_NOATIME (1 << 9) #define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_COMPRESS (1 << 11) /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2bdb124333a..125639ddaff 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1854,6 +1854,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eaa27148419..7a7a202b82a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -384,7 +384,8 @@ again: */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress))) { + (BTRFS_I(inode)->force_compress) || + (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -1256,7 +1257,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress)) + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); else @@ -4584,7 +4586,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if ((mode & S_IFREG)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root, NODATACOW) || + (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } @@ -6866,6 +6869,26 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +/* + * If a file is moved, it will inherit the cow and compression flags of the new + * directory. + */ +static void fixup_inode_flags(struct inode *dir, struct inode *inode) +{ + struct btrfs_inode *b_dir = BTRFS_I(dir); + struct btrfs_inode *b_inode = BTRFS_I(inode); + + if (b_dir->flags & BTRFS_INODE_NODATACOW) + b_inode->flags |= BTRFS_INODE_NODATACOW; + else + b_inode->flags &= ~BTRFS_INODE_NODATACOW; + + if (b_dir->flags & BTRFS_INODE_COMPRESS) + b_inode->flags |= BTRFS_INODE_COMPRESS; + else + b_inode->flags &= ~BTRFS_INODE_COMPRESS; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -6999,6 +7022,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + fixup_inode_flags(new_dir, old_inode); + ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 88d3cb2eaf7..32c980ae0f1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -138,6 +138,24 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg) return 0; } +static int check_flags(unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | \ + FS_NOCOW_FL | FS_COW_FL)) + return -EOPNOTSUPP; + + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL)) + return -EINVAL; + + return 0; +} + static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file->f_path.dentry->d_inode; @@ -153,10 +171,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DIRSYNC_FL)) - return -EOPNOTSUPP; + ret = check_flags(flags); + if (ret) + return ret; if (!is_owner_or_cap(inode)) return -EACCES; @@ -201,6 +218,22 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) else ip->flags &= ~BTRFS_INODE_DIRSYNC; + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. + */ + if (flags & FS_NOCOMP_FL) { + ip->flags &= ~BTRFS_INODE_COMPRESS; + ip->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & FS_COMPR_FL) { + ip->flags |= BTRFS_INODE_COMPRESS; + ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + } + if (flags & FS_NOCOW_FL) + ip->flags |= BTRFS_INODE_NODATACOW; + else if (flags & FS_COW_FL) + ip->flags &= ~BTRFS_INODE_NODATACOW; trans = btrfs_join_transaction(root, 1); BUG_ON(IS_ERR(trans)); -- cgit v1.2.3-70-g09d2 From 3ab3564f018b9b265d0258e4a231794bacd5ad85 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 22 Mar 2011 17:20:26 +0000 Subject: btrfs: return EXDEV when linking from different subvolumes btrfs_link returns EPERM if a cross-subvolume link is attempted. However, in this case I believe EXDEV to be the more appropriate value. >From the link(2) man page: EXDEV oldpath and newpath are not on the same mounted file system. (Linux permits a file system to be mounted at multiple points, but link() does not work across different mount points, even if the same file system is mounted on both.) This matters because an application may have different behaviors based on return codes. Signed-off-by: Mark Fasheh Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7a7a202b82a..67fd6e9552d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4817,7 +4817,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) - return -EPERM; + return -EXDEV; btrfs_inc_nlink(inode); inode->i_ctime = CURRENT_TIME; -- cgit v1.2.3-70-g09d2 From dac97e516c617f9c797f64b0224050b70aea30c7 Mon Sep 17 00:00:00 2001 From: Yoshinori Sano Date: Tue, 15 Feb 2011 12:01:42 +0000 Subject: Btrfs: fix uncheck memory allocations To make Btrfs code more robust, several return value checks where memory allocation can fail are introduced. I use BUG_ON where I don't know how to handle the error properly, which increases the number of using the notorious BUG_ON, though. Signed-off-by: Yoshinori Sano Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 6 ++++++ fs/btrfs/extent-tree.c | 4 ++++ fs/btrfs/inode.c | 2 ++ 3 files changed, 12 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4d2110eafe2..992a4b92083 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -340,6 +340,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + return -ENOMEM; atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -354,6 +356,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + if(!bio) { + kfree(cb); + return -ENOMEM; + } bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; atomic_inc(&cb->pending_bios); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1efeda3b2f6..cd0b69f5737 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6978,6 +6978,10 @@ static noinline int get_new_locations(struct inode *reloc_inode, struct disk_extent *old = exts; max *= 2; exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) { + ret = -ENOMEM; + goto out; + } memcpy(exts, old, sizeof(*exts) * nr); if (old != *extents) kfree(old); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67fd6e9552d..f739b256967 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -290,6 +290,7 @@ static noinline int add_async_extent(struct async_cow *cow, struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; @@ -388,6 +389,7 @@ again: (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + BUG_ON(!pages); if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; -- cgit v1.2.3-70-g09d2 From c2db1073fdf9757e6fd8b4a59d15b6ecc7a2af8a Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 1 Mar 2011 06:48:31 +0000 Subject: Btrfs: check return value of btrfs_alloc_path() Adding the check on the return value of btrfs_alloc_path() to several places. And, some of callers are modified by this change. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 11 +++++++---- fs/btrfs/dir-item.c | 10 ++++++---- fs/btrfs/file-item.c | 2 ++ fs/btrfs/inode.c | 18 ++++++++++++------ 4 files changed, 27 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 992a4b92083..41d1d7c70e2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -663,8 +663,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, atomic_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - btrfs_lookup_bio_sums(root, inode, comp_bio, - sums); + ret = btrfs_lookup_bio_sums(root, inode, + comp_bio, sums); + BUG_ON(ret); } sums += (comp_bio->bi_size + root->sectorsize - 1) / root->sectorsize; @@ -689,8 +690,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + BUG_ON(ret); + } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); BUG_ON(ret); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 02c97ad61b6..c62f02f6ae6 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -151,7 +151,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root ret = PTR_ERR(dir_item); if (ret == -EEXIST) goto second_insert; - goto out; + goto out_free; } leaf = path->nodes[0]; @@ -170,7 +170,7 @@ second_insert: /* FIXME, use some real flag for selecting the extra index */ if (root == root->fs_info->tree_root) { ret = 0; - goto out; + goto out_free; } btrfs_release_path(root, path); @@ -180,7 +180,7 @@ second_insert: name, name_len); if (IS_ERR(dir_item)) { ret2 = PTR_ERR(dir_item); - goto out; + goto out_free; } leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, location); @@ -192,7 +192,9 @@ second_insert: name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name, name_ptr, name_len); btrfs_mark_buffer_dirty(leaf); -out: + +out_free: + btrfs_free_path(path); if (ret) return ret; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a2134195a85..a6a9d4e8b49 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -170,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; if (bio->bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f739b256967..04e9fffee8c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1467,8 +1467,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); - } else if (!skip_sum) - btrfs_lookup_bio_sums(root, inode, bio, NULL); + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); + if (ret) + return ret; + } goto mapit; } else if (!skip_sum) { /* csum items have already been cloned */ @@ -1903,10 +1906,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, else rw = READ; - BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, failrec->bio_flags, 0); - return 0; + return ret; } /* @@ -5943,9 +5946,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; - } else if (!skip_sum) - btrfs_lookup_bio_sums_dio(root, inode, bio, + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset, csums); + if (ret) + goto err; + } ret = btrfs_map_bio(root, rw, bio, 0, 1); err: -- cgit v1.2.3-70-g09d2 From 92986796d84ef939e304099dece32572a755b280 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Mar 2011 17:14:37 +0000 Subject: btrfs: don't mess with i_nlink of unlocked inode in rename() old_inode is not locked; it's not safe to play with its link count. Instead of bumping it and calling btrfs_unlink_inode(), add a variant of the latter that does not do btrfs_drop_nlink()/ btrfs_update_inode(), call it instead of btrfs_inc_nlink()/ btrfs_unlink_inode() and do btrfs_update_inode() ourselves. Signed-off-by: Al Viro Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 04e9fffee8c..4822b313278 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2664,10 +2664,10 @@ failed: * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory */ -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, struct inode *inode, - const char *name, int name_len) +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) { struct btrfs_path *path; int ret = 0; @@ -2739,12 +2739,25 @@ err: btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; btrfs_update_inode(trans, root, dir); - btrfs_drop_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); out: return ret; } +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + int ret; + ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (!ret) { + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } + return ret; +} + + /* helper to check if there is any shared block in the path */ static int check_path_shared(struct btrfs_root *root, struct btrfs_path *path) @@ -6999,11 +7012,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.name, old_dentry->d_name.len); } else { - btrfs_inc_nlink(old_dentry->d_inode); - ret = btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); } BUG_ON(ret); -- cgit v1.2.3-70-g09d2 From c055e99eea6e4f614267632fac546e7896c0227b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Mar 2011 17:15:18 +0000 Subject: btrfs: check link counter overflow in link(2) Signed-off-by: Al Viro Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4822b313278..04babaf31a3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4837,6 +4837,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (root->objectid != BTRFS_I(inode)->root->objectid) return -EXDEV; + if (inode->i_nlink == ~0U) + return -EMLINK; + btrfs_inc_nlink(inode); inode->i_ctime = CURRENT_TIME; -- cgit v1.2.3-70-g09d2 From 1561deda687eef0e95065f1268d680ddc5976ee7 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Sun, 27 Mar 2011 16:07:36 +0800 Subject: btrfs: fix possible deadlock by clearing __GFP_FS flag Using the GFP_HIGHUSER_MOVABLE flag to allocate the metadata's page may cause deadlock. Task1 open() ... btrfs_search_slot() ... btrfs_cow_block() ... alloc_page() wait for reclaiming shrink_slab() ... shrink_icache_memory() ... btrfs_evict_inode() ... btrfs_search_slot() If the path is locked by task1, the deadlock happens. So the btree's page cache is different with the file's page cache, it can not allocate pages by GFP_HIGHUSER_MOVABLE flag, we must clear __GFP_FS flag in GFP_HIGHUSER_MOVABLE flag. Reported-by: Itaru Kitayama Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/inode.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b3fc8475870..5cf3aa7b125 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1724,6 +1724,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_bdi; } + fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 04babaf31a3..06274186b29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2536,6 +2536,8 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + if (location.objectid == BTRFS_FREE_SPACE_OBJECTID) + inode->i_mapping->flags &= ~__GFP_FS; /* * try to precache a NULL acl entry for files that don't have @@ -4084,7 +4086,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - inode_tree_add(inode); unlock_new_inode(inode); if (new) -- cgit v1.2.3-70-g09d2