Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 182 |
1 file changed, 117 insertions, 65 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55e..a18ceabd99a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 
 	/* get the inode */
 	key.objectid = defrag->root;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 
 	index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 	}
 
 	key.objectid = defrag->ino;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
 	if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 		if (unlikely(copied == 0))
 			break;
 
-		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+		if (copied < PAGE_CACHE_SIZE - offset) {
 			offset += copied;
 		} else {
 			pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	bool force_page_uptodate = false;
 	bool need_unlock;
 
-	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
-		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
-		     (sizeof(struct page *)));
+	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
+		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
 	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
 	nrptrs = max(nrptrs, 8);
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		size_t write_bytes = min(iov_iter_count(i),
 					 nrptrs * (size_t)PAGE_CACHE_SIZE -
 					 offset);
-		size_t num_pages = (write_bytes + offset +
-				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
+						PAGE_CACHE_SIZE);
 		size_t reserve_bytes;
 		size_t dirty_pages;
 		size_t copied;
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 				 * our prealloc extent may be smaller than
 				 * write_bytes, so scale down.
 				 */
-				num_pages = (write_bytes + offset +
-					     PAGE_CACHE_SIZE - 1) >>
-					     PAGE_CACHE_SHIFT;
+				num_pages = DIV_ROUND_UP(write_bytes + offset,
+							 PAGE_CACHE_SIZE);
 				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
 				ret = 0;
 			} else {
@@ -1590,9 +1588,8 @@ again:
 			dirty_pages = 0;
 		} else {
 			force_page_uptodate = false;
-			dirty_pages = (copied + offset +
-				       PAGE_CACHE_SIZE - 1) >>
-				       PAGE_CACHE_SHIFT;
+			dirty_pages = DIV_ROUND_UP(copied + offset,
+						   PAGE_CACHE_SIZE);
 		}
 
 		/*
@@ -1653,7 +1650,7 @@ again:
 		cond_resched();
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
-		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+		if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root);
 
 		pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	if (sync)
 		atomic_inc(&BTRFS_I(inode)->sync_writers);
 
-	if (unlikely(file->f_flags & O_DIRECT)) {
+	if (file->f_flags & O_DIRECT) {
 		num_written = __btrfs_direct_write(iocb, from, pos);
 	} else {
 		num_written = __btrfs_buffered_write(file, from, pos);
@@ -1838,6 +1835,8 @@ out:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+	if (filp->private_data)
+		btrfs_ioctl_trans_end(filp);
 	/*
 	 * ordered_data_close is set by settattr when we are about to truncate
 	 * a file from a non-zero size to a zero size. This tries to
@@ -1845,29 +1844,25 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 	 * application were using truncate to replace a file in place.
 	 */
 	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-			       &BTRFS_I(inode)->runtime_flags)) {
-		struct btrfs_trans_handle *trans;
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-
-		/*
-		 * We need to block on a committing transaction to keep us from
-		 * throwing a ordered operation on to the list and causing
-		 * something like sync to deadlock trying to flush out this
-		 * inode.
-		 */
-		trans = btrfs_start_transaction(root, 0);
-		if (IS_ERR(trans))
-			return PTR_ERR(trans);
-		btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
-		btrfs_end_transaction(trans, root);
-		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+			       &BTRFS_I(inode)->runtime_flags))
 			filemap_flush(inode->i_mapping);
-	}
-	if (filp->private_data)
-		btrfs_ioctl_trans_end(filp);
 	return 0;
 }
 
+static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+{
+	int ret;
+
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
+
+	return ret;
+}
+
 /*
  * fsync call for both files and directories. This logs the inode into
  * the tree log instead of forcing full commits whenever possible.
@@ -1897,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * multi-task, and make the performance up. See
 	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
 	 */
-	atomic_inc(&BTRFS_I(inode)->sync_writers);
-	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-			     &BTRFS_I(inode)->runtime_flags))
-		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-	atomic_dec(&BTRFS_I(inode)->sync_writers);
+	ret = start_ordered_ops(inode, start, end);
 	if (ret)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
-
-	/*
-	 * We flush the dirty pages again to avoid some dirty pages in the
-	 * range being left.
-	 */
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags);
+	/*
+	 * We might have had more pages made dirty after calling
+	 * start_ordered_ops and before acquiring the inode's i_mutex.
+	 */
 	if (full_sync) {
+		/*
+		 * For a full sync, we need to make sure any ordered operations
+		 * start and finish before we start logging the inode, so that
+		 * all extents are persisted and the respective file extent
+		 * items are in the fs/subvol btree.
+		 */
 		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
-		if (ret) {
-			mutex_unlock(&inode->i_mutex);
-			goto out;
-		}
+	} else {
+		/*
+		 * Start any new ordered operations before starting to log the
+		 * inode. We will wait for them to finish in btrfs_sync_log().
+		 *
+		 * Right before acquiring the inode's mutex, we might have new
+		 * writes dirtying pages, which won't immediately start the
+		 * respective ordered operations - that is done through the
+		 * fill_delalloc callbacks invoked from the writepage and
+		 * writepages address space operations. So make sure we start
+		 * all ordered operations before starting to log our inode. Not
+		 * doing this means that while logging the inode, writeback
+		 * could start and invoke writepage/writepages, which would call
+		 * the fill_delalloc callbacks (cow_file_range,
+		 * submit_compressed_extents). These callbacks add first an
+		 * extent map to the modified list of extents and then create
+		 * the respective ordered operation, which means in
+		 * tree-log.c:btrfs_log_inode() we might capture all existing
+		 * ordered operations (with btrfs_get_logged_extents()) before
+		 * the fill_delalloc callback adds its ordered operation, and by
+		 * the time we visit the modified list of extent maps (with
+		 * btrfs_log_changed_extents()), we see and process the extent
+		 * map they created. We then use the extent map to construct a
+		 * file extent item for logging without waiting for the
+		 * respective ordered operation to finish - this file extent
+		 * item points to a disk location that might not have yet been
+		 * written to, containing random data - so after a crash a log
+		 * replay will make our inode have file extent items that point
+		 * to disk locations containing invalid data, as we returned
+		 * success to userspace without waiting for the respective
+		 * ordered operation to finish, because it wasn't captured by
+		 * btrfs_get_logged_extents().
+		 */
+		ret = start_ordered_ops(inode, start, end);
+	}
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
 	}
 	atomic_inc(&root->log_batch);
@@ -1982,7 +2011,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	btrfs_init_log_ctx(&ctx);
 
-	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
 	if (ret < 0) {
 		/* Fallthrough and commit/free transaction. */
 		ret = 1;
@@ -2000,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 */
 	mutex_unlock(&inode->i_mutex);
 
+	/*
+	 * If any of the ordered extents had an error, just return it to user
+	 * space, so that the application knows some writes didn't succeed and
+	 * can take proper action (retry for e.g.). Blindly committing the
+	 * transaction in this case, would fool userspace that everything was
+	 * successful. And we also want to make sure our log doesn't contain
+	 * file extent items pointing to extents that weren't fully written to -
+	 * just like in the non fast fsync path, where we check for the ordered
+	 * operation's error flag before writing to the log tree and return -EIO
+	 * if any of them had this flag set (btrfs_wait_ordered_range) -
+	 * therefore we need to check for errors in the ordered operations,
+	 * which are indicated by ctx.io_err.
+	 */
+	if (ctx.io_err) {
+		btrfs_end_transaction(trans, root);
+		ret = ctx.io_err;
+		goto out;
+	}
+
 	if (ret != BTRFS_NO_LOG_SYNC) {
 		if (!ret) {
 			ret = btrfs_sync_log(trans, root, &ctx);
@@ -2112,10 +2160,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
 		goto out;
 	}
 
-	if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
 		u64 num_bytes;
 
-		path->slots[0]++;
 		key.offset = offset;
 		btrfs_set_item_key_safe(root, path, &key);
 		fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2240,7 +2287,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
-	lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+	lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
 	lockend = round_down(offset + len,
 			     BTRFS_I(inode)->root->sectorsize) - 1;
 	same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2301,7 +2348,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 						tail_start + tail_len, 0, 1);
 				if (ret)
 					goto out_only_mutex;
-			}
+		}
 		}
 	}
@@ -2638,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
-	u64 lockstart = *offset;
-	u64 lockend = i_size_read(inode);
-	u64 start = *offset;
-	u64 len = i_size_read(inode);
+	u64 lockstart;
+	u64 lockend;
+	u64 start;
+	u64 len;
 	int ret = 0;
 
-	lockend = max_t(u64, root->sectorsize, lockend);
+	if (inode->i_size == 0)
+		return -ENXIO;
+
+	/*
+	 * *offset can be negative, in this case we start finding DATA/HOLE from
+	 * the very start of the file.
+	 */
+	start = max_t(loff_t, 0, *offset);
+
+	lockstart = round_down(start, root->sectorsize);
+	lockend = round_up(i_size_read(inode), root->sectorsize);
 	if (lockend <= lockstart)
 		lockend = lockstart + root->sectorsize;
-
 	lockend--;
 	len = lockend - lockstart + 1;
-	len = max_t(u64, len, root->sectorsize);
-	if (inode->i_size == 0)
-		return -ENXIO;
-
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
 			 &cached_state);
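
Most of the mechanical churn in __btrfs_buffered_write above replaces open-coded round-up-and-shift arithmetic with DIV_ROUND_UP(). The two forms are equivalent whenever the divisor is a power of two, which PAGE_CACHE_SIZE always is. A minimal userspace sketch of that equivalence (the macros below are local stand-ins assuming 4 KiB pages, not the kernel's headers):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Userspace stand-in for the kernel's DIV_ROUND_UP() macro. */
    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    /* Stand-ins for PAGE_CACHE_SHIFT/PAGE_CACHE_SIZE, assuming 4 KiB pages. */
    #define PAGE_CACHE_SHIFT    12
    #define PAGE_CACHE_SIZE     (1UL << PAGE_CACHE_SHIFT)

    int main(void)
    {
            size_t offset, write_bytes;

            for (offset = 0; offset < PAGE_CACHE_SIZE; offset += 512) {
                    for (write_bytes = 0; write_bytes <= 3 * PAGE_CACHE_SIZE;
                         write_bytes += 1000) {
                            /* Open-coded form removed by the patch. */
                            size_t old = (write_bytes + offset +
                                          PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
                            /* Replacement form introduced by the patch. */
                            size_t new = DIV_ROUND_UP(write_bytes + offset,
                                                      PAGE_CACHE_SIZE);
                            assert(old == new);
                    }
            }
            printf("open-coded rounding and DIV_ROUND_UP agree\n");
            return 0;
    }

The refactor is therefore behavior-preserving; it only makes the intent (count of pages covering the byte range) explicit.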
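The find_desired_extent() hunk clamps a possibly negative lseek offset to zero and derives the extent lock range with round_down()/round_up() instead of ad-hoc max_t() arithmetic. A small sketch of that range computation, using local power-of-two rounding macros in place of the kernel helpers (the sector size and file size below are made-up example values):

    #include <stdint.h>
    #include <stdio.h>

    /* Userspace stand-ins for the kernel's rounding helpers (power-of-2 only). */
    #define round_down(x, y)  ((x) & ~((__typeof__(x))(y) - 1))
    #define round_up(x, y)    ((((x) - 1) | ((__typeof__(x))(y) - 1)) + 1)

    int main(void)
    {
            uint64_t sectorsize = 4096;   /* assumed sector size */
            uint64_t i_size = 10000;      /* assumed file size */
            int64_t offsets[] = { -1, 0, 1, 4095, 4096, 123456 };

            for (size_t i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
                    /* Negative offsets scan for DATA/HOLE from file start. */
                    uint64_t start = offsets[i] < 0 ? 0 : (uint64_t)offsets[i];
                    uint64_t lockstart = round_down(start, sectorsize);
                    uint64_t lockend = round_up(i_size, sectorsize);

                    /* Guard for offsets at or past EOF, as in the patch. */
                    if (lockend <= lockstart)
                            lockend = lockstart + sectorsize;
                    lockend--;
                    printf("offset=%6lld -> lock range [%llu, %llu]\n",
                           (long long)offsets[i],
                           (unsigned long long)lockstart,
                           (unsigned long long)lockend);
            }
            return 0;
    }

The early -ENXIO return for empty files and the explicit clamp make the function's handling of degenerate inputs obvious, where the old code reached the same results indirectly through max_t().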
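The ctx.io_err check and the new start/end arguments to btrfs_log_dentry_safe() both serve the userspace fsync() contract: a ranged fsync must not report success while an ordered extent in the range failed or was logged before its data hit disk. Viewed from user space, the guarantee being protected is simply this (hypothetical test path; plain POSIX calls, nothing btrfs-specific):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* Hypothetical test file; any writable path works. */
            int fd = open("/tmp/fsync-demo", O_CREAT | O_WRONLY | O_TRUNC, 0644);
            if (fd < 0)
                    return 1;

            const char buf[] = "some data";
            if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
                    return 1;

            /*
             * fsync() may only return 0 once the data, and the metadata
             * needed to find it, is durable. The btrfs changes above ensure
             * a writeback error on any ordered extent in the synced range is
             * propagated here rather than silently dropped from the fast
             * fsync path.
             */
            if (fsync(fd) != 0)
                    fprintf(stderr, "fsync failed: %s\n", strerror(errno));

            close(fd);
            return 0;
    }

A sketch under those assumptions, not a reproduction of the fixed bug; the original data-loss window required a crash between a ranged fsync and the ordered extent completing.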