diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 289 |
1 files changed, 166 insertions, 123 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 24bfd7ff304..8a064734e6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -148,6 +148,9 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + if (ext4_has_inline_data(inode)) + return 0; + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } @@ -215,7 +218,7 @@ void ext4_evict_inode(struct inode *inode) jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; @@ -226,7 +229,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) @@ -443,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * could be converted. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -489,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, * the result buffer head is unmapped. If the create ==1, it will make sure * the buffer head is mapped. * @@ -504,6 +507,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; + int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -515,6 +519,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); @@ -544,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -553,7 +567,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_KEEP_SIZE); } if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -580,7 +593,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -597,7 +610,13 @@ found: * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* * Here we clear m_flags because after allocating an new extent, @@ -606,12 +625,12 @@ found: map->m_flags &= ~EXT4_MAP_FLAGS; /* - * New blocks allocate and/or writing to uninitialized extent + * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() * with create == 1 flag. */ - down_write((&EXT4_I(inode)->i_data_sem)); + down_write(&EXT4_I(inode)->i_data_sem); /* * if the caller is from delayed allocation writeout path @@ -653,7 +672,6 @@ found: ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -688,7 +706,7 @@ found: has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -907,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle, */ if (dirty) clear_buffer_dirty(bh); + BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, bh); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -1525,7 +1544,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); goto add_delayed; } @@ -1562,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * Try to see if we can get the block without requesting a new * file system block. */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) { /* * We will soon create blocks for this page, and let @@ -1754,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); if (inline_data) { + BUFFER_TRACE(inode_bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode_bh); err = ext4_handle_dirty_metadata(handle, inode, inode_bh); @@ -1831,6 +1851,7 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; + bool keep_towrite = false; trace_ext4_writepage(page); size = i_size_read(inode); @@ -1861,6 +1882,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } + keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -1877,7 +1899,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, wbc); + ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -1896,7 +1918,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) else len = PAGE_CACHE_SIZE; clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; @@ -2017,7 +2039,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, - * and mark buffers as uninit when we perform writes to uninitialized extents + * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. @@ -2111,12 +2133,12 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; - int err; + int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or - * to convert an uninitialized extent to be initialized (in the case + * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in @@ -2133,7 +2155,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL; - if (ext4_should_dioread_nolock(inode)) + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (map->m_flags & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2141,7 +2164,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; - if (map->m_flags & EXT4_MAP_UNINIT) { + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; @@ -2232,13 +2255,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", @@ -3045,9 +3078,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as uninitialized + * For holes, we fallocate those blocks, mark them as unwritten * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as uninitialized. + * still keep the range to write as unwritten. * * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we @@ -3060,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * */ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int overwrite = 0; get_block_t *get_block_func = NULL; int dio_flags = 0; @@ -3075,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ext4_ind_direct_IO(rw, iocb, iter, offset); BUG_ON(iocb->private == NULL); @@ -3099,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * We could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as - * uninitialized to prevent parallel buffered read to expose + * unwritten to prevent parallel buffered read to expose * the stale data before DIO complete the data IO. * * As to previously fallocated extents, ext4 get_block will * just simply mark the buffer mapped but still keep the - * extents uninitialized. + * extents unwritten. * * For non AIO case, we will convert those unwritten extents * to written after return back from blockdev_direct_IO. @@ -3142,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, dio_flags = DIO_LOCKING; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, + inode->i_sb->s_bdev, iter, + offset, get_block_func, ext4_end_io_dio, NULL, @@ -3197,11 +3229,11 @@ retake_lock: } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3214,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_has_inline_data(inode)) return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext4_direct_IO_enter(inode, offset, count, rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iter, offset); else - ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - trace_ext4_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + ret = ext4_ind_direct_IO(rw, iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -3313,33 +3344,13 @@ void ext4_set_aops(struct inode *inode) } /* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that cooresponds to 'from' */ -int ext4_block_zero_page_range(handle_t *handle, +static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -3429,6 +3440,26 @@ unlock: return err; } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { @@ -3502,7 +3533,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length); + trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions @@ -3516,15 +3547,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3605,10 +3627,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -3682,7 +3709,7 @@ void ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode - * or it completely new indode. In those cases we might not + * or it's a completely new inode. In those cases we might not * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) @@ -3934,8 +3961,8 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - set_mask_bits(&inode->i_flags, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4154,11 +4181,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } ret = 0; @@ -4281,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; int err = 0, rc, block; - int need_datasync = 0; + int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; - /* For fields not not tracking in the in-memory inode, + spin_lock(&ei->i_raw_lock); + + /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -4324,12 +4356,13 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) + if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + spin_unlock(&ei->i_raw_lock); goto out_brelse; + } raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); @@ -4338,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle, need_datasync = 1; } if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. - */ - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - goto out_brelse; - ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); - } + cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; } raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -4374,22 +4394,37 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_block[block] = ei->i_data[block]; } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } ext4_inode_csum_set(inode, raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); - + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: brelse(bh); @@ -4402,21 +4437,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4428,15 +4462,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -4446,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4456,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, |