diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_aops.c')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 1034 |
1 files changed, 536 insertions, 498 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 66abe36c121..c9af48fffcd 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -21,28 +21,32 @@ #include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_trans.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_bmap.h" +#include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> #include <linux/writeback.h> +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_READ, /* mapping for a read */ + IO_DELAY, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_NEW /* just allocated */ +}; /* * Prime number of hash buckets since address is used as the key. @@ -81,18 +85,15 @@ void xfs_count_page_state( struct page *page, int *delalloc, - int *unmapped, int *unwritten) { struct buffer_head *bh, *head; - *delalloc = *unmapped = *unwritten = 0; + *delalloc = *unwritten = 0; bh = head = page_buffers(page); do { - if (buffer_uptodate(bh) && !buffer_mapped(bh)) - (*unmapped) = 1; - else if (buffer_unwritten(bh)) + if (buffer_unwritten(bh)) (*unwritten) = 1; else if (buffer_delay(bh)) (*delalloc) = 1; @@ -101,8 +102,9 @@ xfs_count_page_state( STATIC struct block_device * xfs_find_bdev_for_inode( - struct xfs_inode *ip) + struct inode *inode) { + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; if (XFS_IS_REALTIME_INODE(ip)) @@ -163,14 +165,17 @@ xfs_ioend_new_eof( } /* - * Update on-disk file size now that data has been written to disk. - * The current in-memory file size is i_size. If a write is beyond - * eof i_new_size will be the intended file size until i_size is - * updated. If this write does not extend all the way to the valid - * file size then restrict this update to the end of the write. + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. */ - -STATIC void +STATIC int xfs_setfilesize( xfs_ioend_t *ioend) { @@ -178,19 +183,37 @@ xfs_setfilesize( xfs_fsize_t isize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IOMAP_READ); + ASSERT(ioend->io_type != IO_READ); if (unlikely(ioend->io_error)) - return; + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; - xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_ioend_new_eof(ioend); if (isize) { ip->i_d.di_size = isize; - xfs_mark_inode_dirty_sync(ip); + xfs_mark_inode_dirty(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Schedule IO completion handling on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + if (ioend->io_type == IO_UNWRITTEN) + queue_work(xfsconvertd_workqueue, &ioend->io_work); + else + queue_work(xfsdatad_workqueue, &ioend->io_work); + } } /* @@ -198,19 +221,18 @@ xfs_setfilesize( */ STATIC void xfs_end_io( - struct work_struct *work) + struct work_struct *work) { - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IOMAP_UNWRITTEN && + if (ioend->io_type == IO_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - int error; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); @@ -222,30 +244,37 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); + if (ioend->io_type != IO_READ) { + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); + } + + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. + */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else { + if (ioend->io_iocb) + aio_complete(ioend->io_iocb, ioend->io_result, 0); + xfs_destroy_ioend(ioend); + } } /* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. + * Call IO completion handling in caller context on the final put of an ioend. */ STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) +xfs_finish_ioend_sync( + struct xfs_ioend *ioend) { - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq; - - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? - xfsconvertd_workqueue : xfsdatad_workqueue; - queue_work(wq, &ioend->io_work); - if (wait) - flush_workqueue(wq); - } + if (atomic_dec_and_test(&ioend->io_remaining)) + xfs_end_io(&ioend->io_work); } /* @@ -278,6 +307,8 @@ xfs_alloc_ioend( atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; + ioend->io_iocb = NULL; + ioend->io_result = 0; INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; @@ -288,21 +319,25 @@ xfs_map_blocks( struct inode *inode, loff_t offset, ssize_t count, - xfs_iomap_t *mapp, + struct xfs_bmbt_irec *imap, int flags) { int nmaps = 1; + int new = 0; - return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); + return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); } STATIC int -xfs_iomap_valid( - xfs_iomap_t *iomapp, - loff_t offset) +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - return offset >= iomapp->iomap_offset && - offset < iomapp->iomap_offset + iomapp->iomap_bsize; + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; } /* @@ -323,7 +358,7 @@ xfs_end_bio( bio->bi_end_io = NULL; bio_put(bio); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); } STATIC void @@ -341,7 +376,7 @@ xfs_submit_ioend_bio( * but don't update the inode size until I/O completion. */ if (xfs_ioend_new_eof(ioend)) - xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); @@ -465,7 +500,7 @@ xfs_submit_ioend( } if (bio) xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); } while ((ioend = next) != NULL); } @@ -533,19 +568,23 @@ xfs_add_to_ioend( STATIC void xfs_map_buffer( + struct inode *inode, struct buffer_head *bh, - xfs_iomap_t *mp, - xfs_off_t offset, - uint block_bits) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); - ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + - ((offset - mp->iomap_offset) >> block_bits); + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); - ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); bh->b_blocknr = bn; set_buffer_mapped(bh); @@ -553,17 +592,17 @@ xfs_map_buffer( STATIC void xfs_map_at_offset( + struct inode *inode, struct buffer_head *bh, - loff_t offset, - int block_bits, - xfs_iomap_t *iomapp) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); lock_buffer(bh); - xfs_map_buffer(bh, iomapp, offset, block_bits); - bh->b_bdev = iomapp->iomap_target->bt_bdev; + xfs_map_buffer(inode, bh, imap, offset); + bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); @@ -575,31 +614,30 @@ xfs_map_at_offset( STATIC unsigned int xfs_probe_page( struct page *page, - unsigned int pg_offset, - int mapped) + unsigned int pg_offset) { + struct buffer_head *bh, *head; int ret = 0; if (PageWriteback(page)) return 0; + if (!PageDirty(page)) + return 0; + if (!page->mapping) + return 0; + if (!page_has_buffers(page)) + return 0; - if (page->mapping && PageDirty(page)) { - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - if (!buffer_uptodate(bh)) - break; - if (mapped != buffer_mapped(bh)) - break; - ret += bh->b_size; - if (ret >= pg_offset) - break; - } while ((bh = bh->b_this_page) != head); - } else - ret = mapped ? 0 : PAGE_CACHE_SIZE; - } + bh = head = page_buffers(page); + do { + if (!buffer_uptodate(bh)) + break; + if (!buffer_mapped(bh)) + break; + ret += bh->b_size; + if (ret >= pg_offset) + break; + } while ((bh = bh->b_this_page) != head); return ret; } @@ -609,8 +647,7 @@ xfs_probe_cluster( struct inode *inode, struct page *startpage, struct buffer_head *bh, - struct buffer_head *head, - int mapped) + struct buffer_head *head) { struct pagevec pvec; pgoff_t tindex, tlast, tloff; @@ -619,7 +656,7 @@ xfs_probe_cluster( /* First sum forwards in this page */ do { - if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) + if (!buffer_uptodate(bh) || !buffer_mapped(bh)) return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); @@ -653,7 +690,7 @@ xfs_probe_cluster( pg_offset = PAGE_CACHE_SIZE; if (page->index == tindex && trylock_page(page)) { - pg_len = xfs_probe_page(page, pg_offset, mapped); + pg_len = xfs_probe_page(page, pg_offset); unlock_page(page); } @@ -692,11 +729,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IOMAP_UNWRITTEN); + acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IOMAP_DELAY); + acceptable = (type == IO_DELAY); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IOMAP_NEW); + acceptable = (type == IO_NEW); else break; } while ((bh = bh->b_this_page) != head); @@ -719,17 +756,15 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - xfs_iomap_t *mp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int startio, int all_bh) { struct buffer_head *bh, *head; xfs_off_t end_offset; unsigned long p_offset; unsigned int type; - int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; xfs_off_t offset = page_offset(page); @@ -781,32 +816,27 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh)) { if (buffer_unwritten(bh)) - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; else - type = IOMAP_DELAY; + type = IO_DELAY; - if (!xfs_iomap_valid(mp, offset)) { + if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + xfs_map_at_offset(inode, bh, imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, + ioendp, done); - xfs_map_at_offset(bh, offset, bbits, mp); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } page_dirty--; count++; } else { - type = IOMAP_NEW; - if (buffer_mapped(bh) && all_bh && startio) { + type = IO_NEW; + if (buffer_mapped(bh) && all_bh) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -821,14 +851,12 @@ xfs_convert_page( if (uptodate && bh == head) SetPageUptodate(page); - if (startio) { - if (count) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) - done = 1; - } - xfs_start_page_writeback(page, !page_dirty, count); + if (count) { + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) + done = 1; } + xfs_start_page_writeback(page, !page_dirty, count); return done; fail_unlock_page: @@ -845,10 +873,9 @@ STATIC void xfs_cluster_write( struct inode *inode, pgoff_t tindex, - xfs_iomap_t *iomapp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int startio, int all_bh, pgoff_t tlast) { @@ -864,7 +891,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - iomapp, ioendp, wbc, startio, all_bh); + imap, ioendp, wbc, all_bh); if (done) break; } @@ -874,51 +901,186 @@ xfs_cluster_write( } } +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + /* - * Calling this without startio set means we are being asked to make a dirty - * page ready for freeing it's buffers. When called with startio set then - * we are coming from writepage. + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. * - * When called with startio set it is important that we write the WHOLE - * page if possible. - * The bh->b_state's cannot know if any of the blocks or which block for - * that matter are dirty due to mmap writes, and therefore bh uptodate is - * only valid if the page itself isn't completely uptodate. Some layers - * may clear the page dirty flag prior to calling write page, under the - * assumption the entire page will be written out; by not writing out the - * whole page the page can be reused before all valid dirty data is - * written out. Note: in the case of a page that has been dirty'd by - * mapwrite and but partially setup by block_prepare_write the - * bh->b_states's will not agree and only ones setup by BPW/BCW will have - * valid state, thus the whole page must be written out thing. + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + ssize_t len = 1 << inode->i_blkbits; + if (!xfs_is_delayed_page(page, IO_DELAY)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int done; + xfs_fileoff_t offset_fsb; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int error; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + if (!buffer_delay(bh)) + goto next_buffer; + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, offset_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard failed delalloc mapping lookup."); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_buffer; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_buffer; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, + &flist, &done); + + ASSERT(!flist.xbf_count && !flist.xbf_first); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += len; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0); + return; +} + +/* + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + * For any other dirty buffer heads on the page we should flush them. + * + * If we detect that a transaction would be required to flush the page, we + * have to check the process flags first, if we are already in a transaction + * or disk I/O during allocations is off, we need to fail the writepage and + * redirty the page. + */ STATIC int -xfs_page_state_convert( - struct inode *inode, - struct page *page, - struct writeback_control *wbc, - int startio, - int unmapped) /* also implies page uptodate */ +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + int delalloc, unwritten; struct buffer_head *bh, *head; - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; - unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; - pgoff_t end_index, last_index, tlast; + pgoff_t end_index, last_index; ssize_t size, len; - int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0; - int trylock = 0; - int all_bh = unmapped; - - if (startio) { - if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) - trylock |= BMAPI_TRYLOCK; - } + int flags, err, imap_valid = 0, uptodate = 1; + int count = 0; + int all_bh = 0; + + trace_xfs_writepage(inode, page, 0); + + ASSERT(page_has_buffers(page)); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should really be done by the core VM, but until that happens + * filesystems like XFS, btrfs and ext4 have to take care of this + * by themselves. + */ + if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + goto redirty; + + /* + * We need a transaction if there are delalloc or unwritten buffers + * on the page. + * + * If we need a transaction and the process flags say we are already + * in a transaction, or no IO is allowed then mark the page dirty + * again and leave the page as is. + */ + xfs_count_page_state(page, &delalloc, &unwritten); + if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) + goto redirty; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -927,92 +1089,63 @@ xfs_page_state_convert( if (page->index >= end_index) { if ((page->index >= end_index + 1) || !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { - if (startio) - unlock_page(page); + unlock_page(page); return 0; } } - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + offset); len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; bh = head = page_buffers(page); offset = page_offset(page); flags = BMAPI_READ; - type = IOMAP_NEW; - - /* TODO: cleanup count and page_dirty */ + type = IO_NEW; do { if (offset >= end_offset) break; if (!buffer_uptodate(bh)) uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { - /* - * the iomap is actually still valid, but the ioend - * isn't. shouldn't happen too often. - */ - iomap_valid = 0; + + /* + * A hole may still be marked uptodate because discard_buffer + * leaves the flag set. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + ASSERT(!buffer_dirty(bh)); + imap_valid = 0; continue; } - if (iomap_valid) - iomap_valid = xfs_iomap_valid(&iomap, offset); + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); - /* - * First case, map an unwritten extent and prepare for - * extent state conversion transaction on completion. - * - * Second case, allocate space for a delalloc buffer. - * We can return EAGAIN here in the release page case. - * - * Third case, an unmapped buffer was found, and we are - * in a path where we need to write the whole page out. - */ - if (buffer_unwritten(bh) || buffer_delay(bh) || - ((buffer_uptodate(bh) || PageUptodate(page)) && - !buffer_mapped(bh) && (unmapped || startio))) { + if (buffer_unwritten(bh) || buffer_delay(bh)) { int new_ioend = 0; /* * Make sure we don't use a read-only iomap */ if (flags == BMAPI_READ) - iomap_valid = 0; + imap_valid = 0; if (buffer_unwritten(bh)) { - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { - type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE | trylock; - } else { - type = IOMAP_NEW; - flags = BMAPI_WRITE | BMAPI_MMAP; + type = IO_DELAY; + flags = BMAPI_ALLOCATE; + + if (wbc->sync_mode == WB_SYNC_NONE) + flags |= BMAPI_TRYLOCK; } - if (!iomap_valid) { + if (!imap_valid) { /* - * if we didn't have a valid mapping then we + * If we didn't have a valid mapping then we * need to ensure that we put the new mapping * in a new ioend structure. This needs to be * done to ensure that the ioends correctly @@ -1020,74 +1153,57 @@ xfs_page_state_convert( * for unwritten extent conversion. */ new_ioend = 1; - if (type == IOMAP_NEW) { - size = xfs_probe_cluster(inode, - page, bh, head, 0); - } else { - size = len; - } - - err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + err = xfs_map_blocks(inode, offset, len, + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } - if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, &iomap); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, &ioend, - new_ioend); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; + if (imap_valid) { + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, + &ioend, new_ioend); count++; } - } else if (buffer_uptodate(bh) && startio) { + } else if (buffer_uptodate(bh)) { /* * we got here because the buffer is already mapped. * That means it must already have extents allocated * underneath it. Map the extent by reading it. */ - if (!iomap_valid || flags != BMAPI_READ) { + if (!imap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; - size = xfs_probe_cluster(inode, page, bh, - head, 1); + size = xfs_probe_cluster(inode, page, bh, head); err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } /* - * We set the type to IOMAP_NEW in case we are doing a + * We set the type to IO_NEW in case we are doing a * small write at EOF that is extending the file but * without needing an allocation. We need to update the * file size on I/O completion in this case so it is * the same case as having just allocated a new extent * that we are writing into for the first time. */ - type = IOMAP_NEW; + type = IO_NEW; if (trylock_buffer(bh)) { - ASSERT(buffer_mapped(bh)); - if (iomap_valid) + if (imap_valid) all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !iomap_valid); - page_dirty--; + &ioend, !imap_valid); count++; } else { - iomap_valid = 0; + imap_valid = 0; } - } else if ((buffer_uptodate(bh) || PageUptodate(page)) && - (unmapped || startio)) { - iomap_valid = 0; + } else if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); + imap_valid = 0; } if (!iohead) @@ -1098,132 +1214,48 @@ xfs_page_state_convert( if (uptodate && bh == head) SetPageUptodate(page); - if (startio) - xfs_start_page_writeback(page, 1, count); + xfs_start_page_writeback(page, 1, count); - if (ioend && iomap_valid) { - offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> - PAGE_CACHE_SHIFT; - tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, - wbc, startio, all_bh, tlast); - } + if (ioend && imap_valid) { + xfs_off_t end_index; - if (iohead) - xfs_submit_ioend(wbc, iohead); - - return page_dirty; - -error: - if (iohead) - xfs_cancel_ioend(iohead); - - /* - * If it's delalloc and we have nowhere to put it, - * throw it away, unless the lower layers told - * us to try again. - */ - if (err != -EAGAIN) { - if (!unmapped) - block_invalidatepage(page, 0); - ClearPageUptodate(page); - } - return err; -} - -/* - * writepage: Called from one of two places: - * - * 1. we are flushing a delalloc buffer head. - * - * 2. we are writing out a dirty page. Typically the page dirty - * state is cleared before we get here. In this case is it - * conceivable we have no buffer heads. - * - * For delalloc space on the page we need to allocate space and - * flush it. For unmapped buffer heads on the page we should - * allocate space if the page is uptodate. For any other dirty - * buffer heads on the page we should flush them. - * - * If we detect that a transaction would be required to flush - * the page, we have to check the process flags first, if we - * are already in a transaction or disk I/O during allocations - * is off, we need to fail the writepage and redirty the page. - */ + end_index = imap.br_startoff + imap.br_blockcount; -STATIC int -xfs_vm_writepage( - struct page *page, - struct writeback_control *wbc) -{ - int error; - int need_trans; - int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; + /* to bytes */ + end_index <<= inode->i_blkbits; - trace_xfs_writepage(inode, page, 0); + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; - /* - * We need a transaction if: - * 1. There are delalloc buffers on the page - * 2. The page is uptodate and we have unmapped buffers - * 3. The page is uptodate and we have no buffers - * 4. There are unwritten buffers on the page - */ + /* check against file size */ + if (end_index > last_index) + end_index = last_index; - if (!page_has_buffers(page)) { - unmapped = 1; - need_trans = 1; - } else { - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!PageUptodate(page)) - unmapped = 0; - need_trans = delalloc + unmapped + unwritten; + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, all_bh, end_index); } - /* - * If we need a transaction and the process flags say - * we are already in a transaction, or no IO is allowed - * then mark the page dirty again and leave the page - * as is. - */ - if (current_test_flags(PF_FSTRANS) && need_trans) - goto out_fail; - - /* - * Delay hooking up buffer heads until we have - * made our go/no-go decision. - */ - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); + if (iohead) + xfs_submit_ioend(wbc, iohead); + return 0; - /* - * VM calculation for nr_to_write seems off. Bump it way - * up, this gets simple streaming writes zippy again. - * To be reviewed again after Jens' writeback changes. - */ - wbc->nr_to_write *= 4; +error: + if (iohead) + xfs_cancel_ioend(iohead); - /* - * Convert delayed allocate, unwritten or unmapped space - * to real space and flush out to disk. - */ - error = xfs_page_state_convert(inode, page, wbc, 1, unmapped); - if (error == -EAGAIN) - goto out_fail; - if (unlikely(error < 0)) - goto out_unlock; + if (err == -EAGAIN) + goto redirty; - return 0; + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + return err; -out_fail: +redirty: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; -out_unlock: - unlock_page(page); - return error; } STATIC int @@ -1237,65 +1269,27 @@ xfs_vm_writepages( /* * Called to move a page into cleanable state - and from there - * to be released. Possibly the page is already clean. We always + * to be released. The page should already be clean. We always * have buffer heads in this call. * - * Returns 0 if the page is ok to release, 1 otherwise. - * - * Possible scenarios are: - * - * 1. We are being called to release a page which has been written - * to via regular I/O. buffer heads will be dirty and possibly - * delalloc. If no delalloc buffer heads in this case then we - * can just return zero. - * - * 2. We are called to release a page which has been written via - * mmap, all we need to do is ensure there is no delalloc - * state in the buffer heads, if not we can let the caller - * free them and we should come back later via writepage. + * Returns 1 if the page is ok to release, 0 otherwise. */ STATIC int xfs_vm_releasepage( struct page *page, gfp_t gfp_mask) { - struct inode *inode = page->mapping->host; - int dirty, delalloc, unmapped, unwritten; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - }; + int delalloc, unwritten; - trace_xfs_releasepage(inode, page, 0); + trace_xfs_releasepage(page->mapping->host, page, 0); - if (!page_has_buffers(page)) - return 0; - - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!delalloc && !unwritten) - goto free_buffers; + xfs_count_page_state(page, &delalloc, &unwritten); - if (!(gfp_mask & __GFP_FS)) + if (WARN_ON(delalloc)) return 0; - - /* If we are already inside a transaction or the thread cannot - * do I/O, we cannot release this page. - */ - if (current_test_flags(PF_FSTRANS)) + if (WARN_ON(unwritten)) return 0; - /* - * Convert delalloc space to real space, do not flush the - * data out to disk, that will be done by the caller. - * Never need to allocate space here - we will always - * come back to writepage in that case. - */ - dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0); - if (dirty == 0 && !unwritten) - goto free_buffers; - return 0; - -free_buffers: return try_to_free_buffers(page); } @@ -1305,13 +1299,14 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - int direct, - bmapi_flags_t flags) + int direct) { - xfs_iomap_t iomap; + int flags = create ? BMAPI_WRITE : BMAPI_READ; + struct xfs_bmbt_irec imap; xfs_off_t offset; ssize_t size; - int niomap = 1; + int nimap = 1; + int new = 0; int error; offset = (xfs_off_t)iblock << inode->i_blkbits; @@ -1321,23 +1316,25 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - error = xfs_iomap(XFS_I(inode), offset, size, - create ? flags : BMAPI_READ, &iomap, &niomap); + if (direct && create) + flags |= BMAPI_DIRECT; + + error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, + &new); if (error) return -error; - if (niomap == 0) + if (nimap == 0) return 0; - if (iomap.iomap_bn != IOMAP_DADDR_NULL) { + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { /* * For unwritten extents do not report a disk address on * the read case (treat as if we're reading into a hole). */ - if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { - xfs_map_buffer(bh_result, &iomap, offset, - inode->i_blkbits); - } - if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { if (direct) bh_result->b_private = inode; set_buffer_unwritten(bh_result); @@ -1348,7 +1345,7 @@ __xfs_get_blocks( * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ - bh_result->b_bdev = iomap.iomap_target->bt_bdev; + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); /* * If we previously allocated a block out beyond eof and we are now @@ -1362,10 +1359,10 @@ __xfs_get_blocks( if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || (offset >= i_size_read(inode)) || - (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) + (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (iomap.iomap_flags & IOMAP_DELAY) { + if (imap.br_startblock == DELAYSTARTBLOCK) { BUG_ON(direct); if (create) { set_buffer_uptodate(bh_result); @@ -1374,11 +1371,23 @@ __xfs_get_blocks( } } + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + */ if (direct || size > (1 << inode->i_blkbits)) { - ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); - offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, size); - bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; } return 0; @@ -1391,8 +1400,7 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, - bh_result, create, 0, BMAPI_WRITE); + return __xfs_get_blocks(inode, iblock, bh_result, create, 0); } STATIC int @@ -1402,61 +1410,59 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, - bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT); + return __xfs_get_blocks(inode, iblock, bh_result, create, 1); } +/* + * Complete a direct I/O write request. + * + * If the private argument is non-NULL __xfs_get_blocks signals us that we + * need to issue a transaction to convert the range from unwritten to written + * extents. In case this is regular synchronous I/O we just call xfs_end_io + * to do this and we are done. But in case this was a successfull AIO + * request this handler is called from interrupt context, from which we + * can't start transactions. In that case offload the I/O completion to + * the workqueues we also use for buffered I/O completion. + */ STATIC void -xfs_end_io_direct( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private, + int ret, + bool is_async) { - xfs_ioend_t *ioend = iocb->private; + struct xfs_ioend *ioend = iocb->private; /* - * Non-NULL private data means we need to issue a transaction to - * convert a range from unwritten to written extents. This needs - * to happen from process context but aio+dio I/O completion - * happens from irq context so we need to defer it to a workqueue. - * This is not necessary for synchronous direct I/O, but we do - * it anyway to keep the code uniform and simpler. - * - * Well, if only it were that simple. Because synchronous direct I/O - * requires extent conversion to occur *before* we return to userspace, - * we have to wait for extent conversion to complete. Look at the - * iocb that has been passed to us to determine if this is AIO or - * not. If it is synchronous, tell xfs_finish_ioend() to kick the - * workqueue and wait for it to complete. - * - * The core direct I/O code might be changed to always call the - * completion handler in the future, in which case all this can - * go away. + * blockdev_direct_IO can return an error even after the I/O + * completion handler was called. Thus we need to protect + * against double-freeing. */ + iocb->private = NULL; + ioend->io_offset = offset; ioend->io_size = size; - if (ioend->io_type == IOMAP_READ) { - xfs_finish_ioend(ioend, 0); - } else if (private && size > 0) { - xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); - } else { + if (private && size > 0) + ioend->io_type = IO_UNWRITTEN; + + if (is_async) { /* - * A direct I/O write ioend starts it's life in unwritten - * state in case they map an unwritten extent. This write - * didn't map an unwritten extent so switch it's completion - * handler. + * If we are converting an unwritten extent we need to delay + * the AIO completion until after the unwrittent extent + * conversion has completed, otherwise do it ASAP. */ - ioend->io_type = IOMAP_NEW; - xfs_finish_ioend(ioend, 0); + if (ioend->io_type == IO_UNWRITTEN) { + ioend->io_iocb = iocb; + ioend->io_result = ret; + } else { + aio_complete(iocb, ret, 0); + } + xfs_finish_ioend(ioend); + } else { + xfs_finish_ioend_sync(ioend); } - - /* - * blockdev_direct_IO can return an error even after the I/O - * completion handler was called. Thus we need to protect - * against double-freeing. - */ - iocb->private = NULL; } STATIC ssize_t @@ -1467,26 +1473,45 @@ xfs_vm_direct_IO( loff_t offset, unsigned long nr_segs) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct block_device *bdev; - ssize_t ret; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + ssize_t ret; - bdev = xfs_find_bdev_for_inode(XFS_I(inode)); + if (rw & WRITE) { + iocb->private = xfs_alloc_ioend(inode, IO_NEW); - iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? - IOMAP_UNWRITTEN : IOMAP_READ); - - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, - xfs_end_io_direct); + xfs_end_io_direct_write, NULL, 0); + if (ret != -EIOCBQUEUED && iocb->private) + xfs_destroy_ioend(iocb->private); + } else { + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + NULL, NULL, 0); + } - if (unlikely(ret != -EIOCBQUEUED && iocb->private)) - xfs_destroy_ioend(iocb->private); return ret; } +STATIC void +xfs_vm_write_failed( + struct address_space *mapping, + loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + struct iattr ia = { + .ia_valid = ATTR_SIZE | ATTR_FORCE, + .ia_size = inode->i_size, + }; + xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); + } +} + STATIC int xfs_vm_write_begin( struct file *file, @@ -1497,9 +1522,31 @@ xfs_vm_write_begin( struct page **pagep, void **fsdata) { - *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - xfs_get_blocks); + int ret; + + ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, + pagep, xfs_get_blocks); + if (unlikely(ret)) + xfs_vm_write_failed(mapping, pos + len); + return ret; +} + +STATIC int +xfs_vm_write_end( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned copied, + struct page *page, + void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (unlikely(ret < len)) + xfs_vm_write_failed(mapping, pos + len); + return ret; } STATIC sector_t @@ -1510,7 +1557,7 @@ xfs_vm_bmap( struct inode *inode = (struct inode *)mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_itrace_entry(XFS_I(inode)); + trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -1535,15 +1582,6 @@ xfs_vm_readpages( return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned long offset) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset); - block_invalidatepage(page, offset); -} - const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, @@ -1553,7 +1591,7 @@ const struct address_space_operations xfs_address_space_operations = { .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, .write_begin = xfs_vm_write_begin, - .write_end = generic_write_end, + .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, |