diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-26 11:34:40 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-26 11:34:40 -0700 |
commit | 2ac232f37fa0e8551856a575fe299c47b65b4d66 (patch) | |
tree | 58ff15ecdbc383415a82ea678e5191db16a479f3 /fs/ext3/inode.c | |
parent | fa8f53ace4af9470d8414427cb3dc3c0ffc4f182 (diff) | |
parent | 5cf49d763eb141d236e92be6d4a0dc94e31fa886 (diff) | |
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6:
jbd: change the field "b_cow_tid" of struct journal_head from type unsigned to tid_t
ext3.txt: update the links in the section "useful links" to the latest ones
ext3: Fix data corruption in inodes with journalled data
ext2: check xattr name_len before acquiring xattr_sem in ext2_xattr_get
ext3: Fix compilation with -DDX_DEBUG
quota: Remove unused declaration
jbd: Use WRITE_SYNC in journal checkpoint.
jbd: Fix oops in journal_remove_journal_head()
ext3: Return -EINVAL when start is beyond the end of fs in ext3_trim_fs()
ext3/ioctl.c: silence sparse warnings about different address spaces
ext3/ext4 Documentation: remove bh/nobh since it has been deprecated
ext3: Improve truncate error handling
ext3: use proper little-endian bitops
ext2: include fs.h into ext2_fs.h
ext3: Fix oops in ext3_try_to_allocate_with_rsv()
jbd: fix a bug of leaking jh->b_jcount
jbd: remove dependency on __GFP_NOFAIL
ext3: Convert ext3 to new truncate calling convention
jbd: Add fixed tracepoints
ext3: Add fixed tracepoints
Resolve conflicts in fs/ext3/fsync.c due to fsync locking push-down and
new fixed tracepoints.
Diffstat (limited to 'fs/ext3/inode.c')
-rw-r--r-- | fs/ext3/inode.c | 193 |
1 files changed, 136 insertions, 57 deletions
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2978a2a17a5..04da6acde85 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -38,10 +38,12 @@ #include <linux/bio.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <trace/events/ext3.h> #include "xattr.h" #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -70,6 +72,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -194,20 +197,47 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode) */ void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_block_alloc_info *rsv; handle_t *handle; int want_delete = 0; + trace_ext3_evict_inode(inode); if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); want_delete = 1; } + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. 
+ */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); ext3_discard_reservation(inode); - rsv = EXT3_I(inode)->i_block_alloc_info; - EXT3_I(inode)->i_block_alloc_info = NULL; + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); @@ -231,15 +261,13 @@ void ext3_evict_inode (struct inode *inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. 
*/ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -842,6 +870,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -886,6 +915,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -934,9 +966,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -970,6 +999,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -1202,6 +1234,16 @@ static void ext3_truncate_failed_write(struct inode *inode) ext3_truncate(inode); } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. 
+ */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1217,6 +1259,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + trace_ext3_write_begin(inode, pos, len, flags); + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1332,6 +1376,7 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; + trace_ext3_ordered_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1412,7 @@ static int ext3_writeback_write_end(struct file *file, struct inode *inode = file->f_mapping->host; int ret; + trace_ext3_writeback_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); update_file_sizes(inode, pos, copied); /* @@ -1391,10 +1437,12 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1419,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file, if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_set_inode_state(inode, EXT3_STATE_JDATA); - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + atomic_set(&ei->i_datasync_tid, 
handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1577,6 +1626,7 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1647,6 +1697,7 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1689,6 +1740,7 @@ static int ext3_journalled_writepage(struct page *page, if (ext3_journal_current_handle()) goto no_write; + trace_ext3_journalled_writepage(page); handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1715,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page, if (ret == 0) ret = err; ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); } else { /* @@ -1739,6 +1793,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1753,6 +1808,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ @@ -1766,6 +1823,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1794,6 +1852,8 @@ 
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_length(iov, nr_segs); int retries = 0; + trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (rw == WRITE) { loff_t final_size = offset + count; @@ -1827,7 +1887,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext3_truncate_failed_direct_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1841,7 +1901,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate(inode); + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); goto out; } @@ -1867,6 +1927,8 @@ retry: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); return ret; } @@ -1949,17 +2011,24 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. 
*/ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2004,11 +2073,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -2022,6 +2103,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -2390,8 +2474,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, int ext3_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -2435,7 +2517,6 @@ 
void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2443,7 +2524,8 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; + + trace_ext3_truncate_enter(inode); if (!ext3_can_truncate(inode)) goto out_notrans; @@ -2451,37 +2533,12 @@ void ext3_truncate(struct inode *inode) if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - goto out_notrans; - } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) goto out_notrans; - } last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -2596,6 +2653,7 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); return; out_notrans: /* @@ -2604,6 +2662,7 @@ out_notrans: */ if (inode->i_nlink) ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2745,6 +2804,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. 
*/ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); @@ -3229,18 +3289,36 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - rc = vmtruncate(inode, attr->ia_size); - if (rc) - goto err_out; + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); } setattr_copy(inode, attr); @@ -3374,6 +3452,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); |