diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 11:35:36 -0800 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 11:35:36 -0800 |
commit | 4ba24fef3eb3b142197135223b90ced2f319cd53 (patch) | |
tree | a20c125b27740ec7b4c761b11d801108e1b316b2 /fs/btrfs/file.c | |
parent | 47c1ffb2b6b630894e9a16442611c056ab21c057 (diff) | |
parent | 98a4a59ee31a12105a2b84f5b8b515ac2cb208ef (diff) |
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 194 |
1 files changed, 147 insertions, 47 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ff1cc0399b9..e4090259569 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, /* get the inode */ key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, } key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { @@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, if (unlikely(copied == 0)) break; - if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + if (copied < PAGE_CACHE_SIZE - offset) { offset += copied; } else { pg++; @@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_nocow_write(root); + ret = btrfs_start_write_no_snapshoting(root); if (!ret) return -ENOSPC; @@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, bool force_page_uptodate = false; bool need_unlock; - nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / - (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); @@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * our prealloc extent may be smaller than * write_bytes, so scale down. */ - num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; ret = 0; } else { @@ -1545,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, btrfs_free_reserved_data_space(inode, reserve_bytes); else - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); break; } @@ -1590,9 +1588,8 @@ again: dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = (copied + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + dirty_pages = DIV_ROUND_UP(copied + offset, + PAGE_CACHE_SIZE); } /* @@ -1635,7 +1632,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); @@ -1653,7 +1650,7 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root); pos += copied; @@ -1664,7 +1661,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { btrfs_delalloc_release_space(inode, release_bytes); @@ -1679,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t pos) { struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); ssize_t written; ssize_t written_buffered; loff_t endbyte; @@ -1695,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, err = written_buffered; goto out; } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. + */ endbyte = pos + written_buffered - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + err = btrfs_fdatawrite_range(inode, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); if (err) goto out; written += written_buffered; @@ -1795,7 +1800,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (unlikely(file->f_flags & O_DIRECT)) { + if (file->f_flags & O_DIRECT) { num_written = __btrfs_direct_write(iocb, from, pos); } else { num_written = __btrfs_buffered_write(file, from, pos); @@ -1852,6 +1857,17 @@ int btrfs_release_file(struct inode *inode, struct file *filp) return 0; } +static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + atomic_inc(&BTRFS_I(inode)->sync_writers); + ret = btrfs_fdatawrite_range(inode, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); + + return ret; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -1881,30 +1897,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * multi-task, and make the performance up. See * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ - atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - atomic_dec(&BTRFS_I(inode)->sync_writers); + ret = start_ordered_ops(inode, start, end); if (ret) return ret; mutex_lock(&inode->i_mutex); - - /* - * We flush the dirty pages again to avoid some dirty pages in the - * range being left. - */ atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We might have have had more pages made dirty after calling + * start_ordered_ops and before acquiring the inode's i_mutex. + */ if (full_sync) { + /* + * For a full sync, we need to make sure any ordered operations + * start and finish before we start logging the inode, so that + * all extents are persisted and the respective file extent + * items are in the fs/subvol btree. + */ ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - goto out; - } + } else { + /* + * Start any new ordered operations before starting to log the + * inode. We will wait for them to finish in btrfs_sync_log(). + * + * Right before acquiring the inode's mutex, we might have new + * writes dirtying pages, which won't immediately start the + * respective ordered operations - that is done through the + * fill_delalloc callbacks invoked from the writepage and + * writepages address space operations. So make sure we start + * all ordered operations before starting to log our inode. Not + * doing this means that while logging the inode, writeback + * could start and invoke writepage/writepages, which would call + * the fill_delalloc callbacks (cow_file_range, + * submit_compressed_extents). These callbacks add first an + * extent map to the modified list of extents and then create + * the respective ordered operation, which means in + * tree-log.c:btrfs_log_inode() we might capture all existing + * ordered operations (with btrfs_get_logged_extents()) before + * the fill_delalloc callback adds its ordered operation, and by + * the time we visit the modified list of extent maps (with + * btrfs_log_changed_extents()), we see and process the extent + * map they created. We then use the extent map to construct a + * file extent item for logging without waiting for the + * respective ordered operation to finish - this file extent + * item points to a disk location that might not have yet been + * written to, containing random data - so after a crash a log + * replay will make our inode have file extent items that point + * to disk locations containing invalid data, as we returned + * success to userspace without waiting for the respective + * ordered operation to finish, because it wasn't captured by + * btrfs_get_logged_extents(). + */ + ret = start_ordered_ops(inode, start, end); + } + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; } atomic_inc(&root->log_batch); @@ -1984,6 +2034,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ mutex_unlock(&inode->i_mutex); + /* + * If any of the ordered extents had an error, just return it to user + * space, so that the application knows some writes didn't succeed and + * can take proper action (retry for e.g.). Blindly committing the + * transaction in this case, would fool userspace that everything was + * successful. And we also want to make sure our log doesn't contain + * file extent items pointing to extents that weren't fully written to - + * just like in the non fast fsync path, where we check for the ordered + * operation's error flag before writing to the log tree and return -EIO + * if any of them had this flag set (btrfs_wait_ordered_range) - + * therefore we need to check for errors in the ordered operations, + * which are indicated by ctx.io_err. + */ + if (ctx.io_err) { + btrfs_end_transaction(trans, root); + ret = ctx.io_err; + goto out; + } + if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); @@ -2621,23 +2690,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - u64 lockstart = *offset; - u64 lockend = i_size_read(inode); - u64 start = *offset; - u64 len = i_size_read(inode); + u64 lockstart; + u64 lockend; + u64 start; + u64 len; int ret = 0; - lockend = max_t(u64, root->sectorsize, lockend); + if (inode->i_size == 0) + return -ENXIO; + + /* + * *offset can be negative, in this case we start finding DATA/HOLE from + * the very start of the file. + */ + start = max_t(loff_t, 0, *offset); + + lockstart = round_down(start, root->sectorsize); + lockend = round_up(i_size_read(inode), root->sectorsize); if (lockend <= lockstart) lockend = lockstart + root->sectorsize; - lockend--; len = lockend - lockstart + 1; - len = max_t(u64, len, root->sectorsize); - if (inode->i_size == 0) - return -ENXIO; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, &cached_state); @@ -2741,3 +2815,29 @@ int btrfs_auto_defrag_init(void) return 0; } + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + + return ret; +} |