From d3cf209476b72c83907a412b6708c5e498410aa7 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 8 May 2007 13:49:27 +1000 Subject: [XFS] propogate return codes from flush routines This patch handles error return values in fs_flush_pages and fs_flushinval_pages. It changes the prototype of fs_flushinval_pages so we can propogate the errors and handle them at higher layers. I also modified xfs_itruncate_start so that it could propogate the error further. SGI-PV: 961990 SGI-Modid: xfs-linux-melb:xfs-kern:28231a Signed-off-by: Lachlan McIlroy Signed-off-by: Stewart Smith Signed-off-by: Tim Shimmin --- fs/xfs/xfs_inode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/xfs_inode.h') diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index bc823720d88..699960f3459 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -481,7 +481,7 @@ uint xfs_ip2xflags(struct xfs_inode *); uint xfs_dic2xflags(struct xfs_dinode_core *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); -void xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); +int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, xfs_fsize_t, int, int); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); -- cgit v1.2.3-70-g09d2 From ba87ea699ebd9dd577bf055ebc4a98200e337542 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 8 May 2007 13:49:46 +1000 Subject: [XFS] Fix to prevent the notorious 'NULL files' problem after a crash. The problem that has been addressed is that of synchronising updates of the file size with writes that extend a file. Without the fix the update of a file's size, as a result of a write beyond eof, is independent of when the cached data is flushed to disk. Often the file size update would be written to the filesystem log before the data is flushed to disk. When a system crashes between these two events and the filesystem log is replayed on mount the file's size will be set but since the contents never made it to disk the file is full of holes. If some of the cached data was flushed to disk then it may just be a section of the file at the end that has holes. There are existing fixes to help alleviate this problem, particularly in the case where a file has been truncated, that force cached data to be flushed to disk when the file is closed. If the system crashes while the file(s) are still open then this flushing will never occur. The fix that we have implemented is to introduce a second file size, called the in-memory file size, that represents the current file size as viewed by the user. The existing file size, called the on-disk file size, is the one that get's written to the filesystem log and we only update it when it is safe to do so. When we write to a file beyond eof we only update the in- memory file size in the write operation. Later when the I/O operation, that flushes the cached data to disk completes, an I/O completion routine will update the on-disk file size. The on-disk file size will be updated to the maximum offset of the I/O or to the value of the in-memory file size if the I/O includes eof. SGI-PV: 958522 SGI-Modid: xfs-linux-melb:xfs-kern:28322a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/xfs_aops.c | 89 ++++++++++++++++++++++++++++++++++++++------ fs/xfs/linux-2.6/xfs_lrw.c | 91 +++++++++++++++++++++++++++------------------ fs/xfs/xfs_bmap.c | 14 ++++--- fs/xfs/xfs_inode.c | 48 +++++++++++++++++++----- fs/xfs/xfs_inode.h | 3 ++ fs/xfs/xfs_iocore.c | 2 +- fs/xfs/xfs_iomap.c | 8 ++-- fs/xfs/xfs_iomap.h | 1 + fs/xfs/xfs_vnodeops.c | 40 ++++++++++---------- 9 files changed, 208 insertions(+), 88 deletions(-) (limited to 'fs/xfs/xfs_inode.h') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 143ffc851c9..4475588e973 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -140,10 +140,47 @@ xfs_destroy_ioend( mempool_free(ioend, xfs_ioend_pool); } +/* + * Update on-disk file size now that data has been written to disk. + * The current in-memory file size is i_size. If a write is beyond + * eof io_new_size will be the intended file size until i_size is + * updated. If this write does not extend all the way to the valid + * file size then restrict this update to the end of the write. + */ +STATIC void +xfs_setfilesize( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip; + xfs_fsize_t isize; + xfs_fsize_t bsize; + + ip = xfs_vtoi(ioend->io_vnode); + + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + ASSERT(ioend->io_type != IOMAP_READ); + + if (unlikely(ioend->io_error)) + return; + + bsize = ioend->io_offset + ioend->io_size; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + isize = MAX(ip->i_size, ip->i_iocore.io_new_size); + isize = MIN(isize, bsize); + + if (ip->i_d.di_size < isize) { + ip->i_d.di_size = isize; + ip->i_update_core = 1; + ip->i_update_size = 1; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + /* * Buffered IO write completion for delayed allocate extents. - * TODO: Update ondisk isize now that we know the file data - * has been flushed (i.e. the notorious "NULL file" problem). */ STATIC void xfs_end_bio_delalloc( @@ -152,6 +189,7 @@ xfs_end_bio_delalloc( xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + xfs_setfilesize(ioend); xfs_destroy_ioend(ioend); } @@ -165,6 +203,7 @@ xfs_end_bio_written( xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + xfs_setfilesize(ioend); xfs_destroy_ioend(ioend); } @@ -184,8 +223,23 @@ xfs_end_bio_unwritten( xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - if (likely(!ioend->io_error)) + if (likely(!ioend->io_error)) { bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL); + xfs_setfilesize(ioend); + } + xfs_destroy_ioend(ioend); +} + +/* + * IO read completion for regular, written extents. + */ +STATIC void +xfs_end_bio_read( + struct work_struct *work) +{ + xfs_ioend_t *ioend = + container_of(work, xfs_ioend_t, io_work); + xfs_destroy_ioend(ioend); } @@ -224,6 +278,8 @@ xfs_alloc_ioend( INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten); else if (type == IOMAP_DELAY) INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc); + else if (type == IOMAP_READ) + INIT_WORK(&ioend->io_work, xfs_end_bio_read); else INIT_WORK(&ioend->io_work, xfs_end_bio_written); @@ -913,7 +969,7 @@ xfs_page_state_convert( bh = head = page_buffers(page); offset = page_offset(page); flags = -1; - type = 0; + type = IOMAP_READ; /* TODO: cleanup count and page_dirty */ @@ -999,7 +1055,7 @@ xfs_page_state_convert( * That means it must already have extents allocated * underneath it. Map the extent by reading it. */ - if (!iomap_valid || type != 0) { + if (!iomap_valid || type != IOMAP_READ) { flags = BMAPI_READ; size = xfs_probe_cluster(inode, page, bh, head, 1); @@ -1010,7 +1066,7 @@ xfs_page_state_convert( iomap_valid = xfs_iomap_valid(&iomap, offset); } - type = 0; + type = IOMAP_READ; if (!test_and_set_bit(BH_Lock, &bh->b_state)) { ASSERT(buffer_mapped(bh)); if (iomap_valid) @@ -1356,12 +1412,21 @@ xfs_end_io_direct( * completion handler in the future, in which case all this can * go away. */ - if (private && size > 0) { - ioend->io_offset = offset; - ioend->io_size = size; + ioend->io_offset = offset; + ioend->io_size = size; + if (ioend->io_type == IOMAP_READ) { + xfs_finish_ioend(ioend); + } else if (private && size > 0) { xfs_finish_ioend(ioend); } else { - xfs_destroy_ioend(ioend); + /* + * A direct I/O write ioend starts it's life in unwritten + * state in case they map an unwritten extent. This write + * didn't map an unwritten extent so switch it's completion + * handler. + */ + INIT_WORK(&ioend->io_work, xfs_end_bio_written); + xfs_finish_ioend(ioend); } /* @@ -1392,15 +1457,15 @@ xfs_vm_direct_IO( if (error) return -error; - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); - if (rw == WRITE) { + iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); ret = blockdev_direct_IO_own_locking(rw, iocb, inode, iomap.iomap_target->bt_bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct); } else { + iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, iomap.iomap_target->bt_bdev, iov, offset, nr_segs, diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 80fe3123347..82ab792c7fc 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -224,7 +224,7 @@ xfs_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((*offset & target->bt_smask) || (size & target->bt_smask)) { - if (*offset == ip->i_d.di_size) { + if (*offset == ip->i_size) { return (0); } return -XFS_ERROR(EINVAL); @@ -387,9 +387,10 @@ xfs_splice_write( { xfs_inode_t *ip = XFS_BHVTOI(bdp); xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; ssize_t ret; struct inode *inode = outfilp->f_mapping->host; - xfs_fsize_t isize; + xfs_fsize_t isize, new_size; XFS_STATS_INC(xs_write_calls); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -410,6 +411,14 @@ xfs_splice_write( return -error; } } + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + io->io_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, pipe, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); @@ -420,14 +429,18 @@ xfs_splice_write( if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) *ppos = isize; - if (*ppos > ip->i_d.di_size) { + if (*ppos > ip->i_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_d.di_size) { - ip->i_d.di_size = *ppos; - i_size_write(inode, *ppos); - ip->i_update_core = 1; - ip->i_update_size = 1; - } + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (io->io_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + io->io_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; xfs_iunlock(ip, XFS_ILOCK_EXCL); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -711,8 +724,6 @@ start: goto out_unlock_mutex; } - isize = i_size_read(inode); - if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? @@ -723,7 +734,7 @@ start: return XFS_ERROR(-EINVAL); } - if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) { + if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; @@ -735,7 +746,7 @@ start: } new_size = pos + count; - if (new_size > isize) + if (new_size > xip->i_size) io->io_new_size = new_size; if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && @@ -751,8 +762,7 @@ start: pos, count, dmflags, &locktype); if (error) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; + goto out_unlock_internal; } xfs_ilock(xip, XFS_ILOCK_EXCL); eventsent = 1; @@ -764,9 +774,8 @@ start: * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ - if ((file->f_flags & O_APPEND) && savedsize != isize) { + if ((file->f_flags & O_APPEND) && savedsize != xip->i_size) goto start; - } } if (likely(!(ioflags & IO_INVIS))) { @@ -784,11 +793,11 @@ start: * to zero it out up to the new size. */ - if (pos > isize) { - error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize); + if (pos > xip->i_size) { + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size); if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + goto out_unlock_internal; } } xfs_iunlock(xip, XFS_ILOCK_EXCL); @@ -808,8 +817,7 @@ start: if (likely(!error)) error = -remove_suid(file->f_path.dentry); if (unlikely(error)) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; + goto out_unlock_internal; } } @@ -879,12 +887,12 @@ retry: error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ - if (error) - goto out_nounlocks; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_rwlock(bdp, locktype); - pos = xip->i_d.di_size; + if (error) + goto out_unlock_internal; + pos = xip->i_size; ret = 0; goto retry; } @@ -893,14 +901,10 @@ retry: if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; - if (*offset > xip->i_d.di_size) { + if (*offset > xip->i_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_d.di_size) { - xip->i_d.di_size = *offset; - i_size_write(inode, *offset); - xip->i_update_core = 1; - xip->i_update_size = 1; - } + if (*offset > xip->i_size) + xip->i_size = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); } @@ -922,16 +926,31 @@ retry: error = sync_page_range(inode, mapping, pos, ret); if (!error) - error = ret; - return error; + error = -ret; + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_rwlock(bdp, locktype); } out_unlock_internal: + if (io->io_new_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + io->io_new_size = 0; + /* + * If this was a direct or synchronous I/O that failed (such + * as ENOSPC) then part of the I/O may have been written to + * disk before the error occured. In this case the on-disk + * file size may have been adjusted beyond the in-memory file + * size and now needs to be truncated back. + */ + if (xip->i_d.di_size > xip->i_size) + xip->i_d.di_size = xip->i_size; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } xfs_rwunlock(bdp, locktype); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); - out_nounlocks: return -error; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 50f2213589f..b1ea26e40aa 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4444,8 +4444,11 @@ xfs_bmap_one_block( xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize; + if (whichfork == XFS_DATA_FORK) { + return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? + (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : + (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + } #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4457,7 +4460,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -5817,7 +5820,7 @@ xfs_getbmap( fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = ip->i_d.di_size; + fixlen = ip->i_size; } } else { prealloced = 0; @@ -5841,7 +5844,8 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) { + if (whichfork == XFS_DATA_FORK && + (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7d1ab3967b8..3ca5d43b834 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -442,6 +442,7 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; + ip->i_size = 0; ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); break; @@ -980,6 +981,7 @@ xfs_iread( } ip->i_delayed_blks = 0; + ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -1170,6 +1172,7 @@ xfs_ialloc( } ip->i_d.di_size = 0; + ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); @@ -1340,7 +1343,7 @@ xfs_file_last_byte( } else { last_block = 0; } - size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size); + size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); last_block = XFS_FILEOFF_MAX(last_block, size_last_block); last_byte = XFS_FSB_TO_B(mp, last_block); @@ -1434,7 +1437,7 @@ xfs_itruncate_start( int error = 0; ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT((flags == XFS_ITRUNC_DEFINITE) || (flags == XFS_ITRUNC_MAYBE)); @@ -1558,7 +1561,7 @@ xfs_itruncate_finish( ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); + ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT(*tp != NULL); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_transp == *tp); @@ -1632,8 +1635,20 @@ xfs_itruncate_finish( */ if (fork == XFS_DATA_FORK) { if (ip->i_d.di_nextents > 0) { - ip->i_d.di_size = new_size; - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } } } else if (sync) { ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); @@ -1769,7 +1784,19 @@ xfs_itruncate_finish( */ if (fork == XFS_DATA_FORK) { xfs_isize_check(mp, ip, new_size); - ip->i_d.di_size = new_size; + /* + * If we are not changing the file size then do + * not update the on-disk file size - we may be + * called from xfs_inactive_free_eofblocks(). If we + * update the on-disk file size and then the system + * crashes before the contents of the file are + * flushed to disk then the files may be full of + * holes (ie NULL files bug). + */ + if (ip->i_size != new_size) { + ip->i_d.di_size = new_size; + ip->i_size = new_size; + } } xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); ASSERT((new_size != 0) || @@ -1802,7 +1829,7 @@ xfs_igrow_start( ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); - ASSERT(new_size > ip->i_d.di_size); + ASSERT(new_size > ip->i_size); /* * Zero any pages that may have been created by @@ -1810,7 +1837,7 @@ xfs_igrow_start( * and any blocks between the old and new file sizes. */ error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, - ip->i_d.di_size); + ip->i_size); return error; } @@ -1834,13 +1861,14 @@ xfs_igrow_finish( ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); ASSERT(ip->i_transp == tp); - ASSERT(new_size > ip->i_d.di_size); + ASSERT(new_size > ip->i_size); /* * Update the file size. Update the inode change timestamp * if change_flag set. */ ip->i_d.di_size = new_size; + ip->i_size = new_size; if (change_flag) xfs_ichgtime(ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -2323,7 +2351,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT((ip->i_d.di_size == 0) || + ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); ASSERT(ip->i_d.di_nblocks == 0); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 699960f3459..cfe7b58c453 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -287,6 +287,7 @@ typedef struct xfs_inode { struct xfs_inode *i_cnext; /* cluster hash link forward */ struct xfs_inode *i_cprev; /* cluster hash link backward */ + xfs_fsize_t i_size; /* in-memory size */ /* Trace buffers per inode. */ #ifdef XFS_BMAP_TRACE struct ktrace *i_xtrace; /* inode extent list trace */ @@ -305,6 +306,8 @@ typedef struct xfs_inode { #endif } xfs_inode_t; +#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ + (ip)->i_size : (ip)->i_d.di_size; /* * i_flags helper functions diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c index 06d710c9ce4..81548ec72ba 100644 --- a/fs/xfs/xfs_iocore.c +++ b/fs/xfs/xfs_iocore.c @@ -52,7 +52,7 @@ STATIC xfs_fsize_t xfs_size_fn( xfs_inode_t *ip) { - return (ip->i_d.di_size); + return XFS_ISIZE(ip); } STATIC int diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index cde70e89544..3f2b9f2a7b9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -458,7 +458,7 @@ xfs_iomap_write_direct( extsz = ip->i_d.di_extsize; } - isize = ip->i_d.di_size; + isize = ip->i_size; if (io->io_new_size > isize) isize = io->io_new_size; @@ -524,7 +524,7 @@ xfs_iomap_write_direct( xfs_trans_ihold(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; - if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz)) + if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -676,7 +676,7 @@ xfs_iomap_write_delay( offset_fsb = XFS_B_TO_FSBT(mp, offset); retry: - isize = ip->i_d.di_size; + isize = ip->i_size; if (io->io_new_size > isize) isize = io->io_new_size; @@ -817,7 +817,7 @@ xfs_iomap_write_allocate( * we dropped the ilock in the interim. */ - end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size); + end_fsb = XFS_B_TO_FSB(mp, ip->i_size); xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); last_block = XFS_FILEOFF_MAX(last_block, end_fsb); diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 3ce204a524b..df441ee936b 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -22,6 +22,7 @@ typedef enum { /* iomap_flags values */ + IOMAP_READ = 0, /* mapping for a read */ IOMAP_EOF = 0x01, /* mapping contains EOF */ IOMAP_HOLE = 0x02, /* mapping covers a hole */ IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6e49bd36246..e17be3b647b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -133,7 +133,7 @@ xfs_getattr( if (!(flags & ATTR_LAZY)) xfs_ilock(ip, XFS_ILOCK_SHARED); - vap->va_size = ip->i_d.di_size; + vap->va_size = XFS_ISIZE(ip); if (vap->va_mask == XFS_AT_SIZE) goto all_done; @@ -496,7 +496,7 @@ xfs_setattr( if (mask & XFS_AT_SIZE) { /* Short circuit the truncate case for zero length files */ if ((vap->va_size == 0) && - (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) { + (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) { xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; if (mask & XFS_AT_CTIME) @@ -614,7 +614,7 @@ xfs_setattr( */ if (mask & XFS_AT_SIZE) { code = 0; - if ((vap->va_size > ip->i_d.di_size) && + if ((vap->va_size > ip->i_size) && (flags & ATTR_NOSIZETOK) == 0) { code = xfs_igrow_start(ip, vap->va_size, credp); } @@ -654,10 +654,10 @@ xfs_setattr( * Truncate file. Must have write permission and not be a directory. */ if (mask & XFS_AT_SIZE) { - if (vap->va_size > ip->i_d.di_size) { + if (vap->va_size > ip->i_size) { xfs_igrow_finish(tp, ip, vap->va_size, !(flags & ATTR_DMI)); - } else if ((vap->va_size <= ip->i_d.di_size) || + } else if ((vap->va_size <= ip->i_size) || ((vap->va_size == 0) && ip->i_d.di_nextents)) { /* * signal a sync transaction unless @@ -1221,7 +1221,7 @@ xfs_inactive_free_eofblocks( * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. */ - end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size)); + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); map_len = last_fsb - end_fsb; if (map_len <= 0) @@ -1258,7 +1258,7 @@ xfs_inactive_free_eofblocks( */ xfs_ilock(ip, XFS_IOLOCK_EXCL); error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, - ip->i_d.di_size); + ip->i_size); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -1282,7 +1282,7 @@ xfs_inactive_free_eofblocks( xfs_trans_ihold(tp, ip); error = xfs_itruncate_finish(&tp, ip, - ip->i_d.di_size, + ip->i_size, XFS_DATA_FORK, 0); /* @@ -1568,7 +1568,7 @@ xfs_release( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & @@ -1629,8 +1629,8 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) || - (ip->i_delayed_blks > 0)) && + ((ip->i_d.di_size != 0) || (ip->i_size != 0) || + (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); mp = ip->i_mount; @@ -1648,7 +1648,7 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && (!(ip->i_d.di_flags & @@ -4055,14 +4055,14 @@ xfs_alloc_file_space( allocatesize_fsb = XFS_B_TO_FSB(mp, count); /* Generate a DMAPI event if needed. */ - if (alloc_type != 0 && offset < ip->i_d.di_size && + if (alloc_type != 0 && offset < ip->i_size && (attr_flags&ATTR_DMI) == 0 && DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { xfs_off_t end_dmi_offset; end_dmi_offset = offset+len; - if (end_dmi_offset > ip->i_d.di_size) - end_dmi_offset = ip->i_d.di_size; + if (end_dmi_offset > ip->i_size) + end_dmi_offset = ip->i_size; error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), offset, end_dmi_offset - offset, 0, NULL); @@ -4318,11 +4318,11 @@ xfs_free_file_space( end_dmi_offset = offset + len; endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); - if (offset < ip->i_d.di_size && + if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 && DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { - if (end_dmi_offset > ip->i_d.di_size) - end_dmi_offset = ip->i_d.di_size; + if (end_dmi_offset > ip->i_size) + end_dmi_offset = ip->i_size; error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, offset, end_dmi_offset - offset, AT_DELAY_FLAG(attr_flags), NULL); @@ -4541,7 +4541,7 @@ xfs_change_file_space( bf->l_start += offset; break; case 2: /*SEEK_END*/ - bf->l_start += ip->i_d.di_size; + bf->l_start += ip->i_size; break; default: return XFS_ERROR(EINVAL); @@ -4558,7 +4558,7 @@ xfs_change_file_space( bf->l_whence = 0; startoffset = bf->l_start; - fsize = ip->i_d.di_size; + fsize = ip->i_size; /* * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve -- cgit v1.2.3-70-g09d2 From f7c66ce3f70d8417de0cfb481ca4e5430382ec5d Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 8 May 2007 13:50:19 +1000 Subject: [XFS] Add lockdep support for XFS SGI-PV: 963965 SGI-Modid: xfs-linux-melb:xfs-kern:28485a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chinner Signed-off-by: Tim Shimmin --- fs/xfs/linux-2.6/mrlock.h | 12 ++++++++++ fs/xfs/xfs_iget.c | 15 ++++++------ fs/xfs/xfs_inode.h | 60 ++++++++++++++++++++++++++++++++++++----------- fs/xfs/xfs_vfsops.c | 2 +- fs/xfs/xfs_vnodeops.c | 27 ++++++++++++++++----- 5 files changed, 88 insertions(+), 28 deletions(-) (limited to 'fs/xfs/xfs_inode.h') diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index af168a1a98c..c110bb00266 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -43,6 +43,18 @@ static inline void mrupdate(mrlock_t *mrp) mrp->mr_writer = 1; } +static inline void mraccess_nested(mrlock_t *mrp, int subclass) +{ + down_read_nested(&mrp->mr_lock, subclass); +} + +static inline void mrupdate_nested(mrlock_t *mrp, int subclass) +{ + down_write_nested(&mrp->mr_lock, subclass); + mrp->mr_writer = 1; +} + + static inline int mrtryaccess(mrlock_t *mrp) { return down_read_trylock(&mrp->mr_lock); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index c1c89dac19c..114433a22ba 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -879,17 +879,17 @@ xfs_ilock(xfs_inode_t *ip, (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { - mrupdate(&ip->i_iolock); + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); } else if (lock_flags & XFS_IOLOCK_SHARED) { - mraccess(&ip->i_iolock); + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); } if (lock_flags & XFS_ILOCK_EXCL) { - mrupdate(&ip->i_lock); + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); } else if (lock_flags & XFS_ILOCK_SHARED) { - mraccess(&ip->i_lock); + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); } xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); } @@ -923,7 +923,7 @@ xfs_ilock_nowait(xfs_inode_t *ip, (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); iolocked = 0; if (lock_flags & XFS_IOLOCK_EXCL) { @@ -983,7 +983,8 @@ xfs_iunlock(xfs_inode_t *ip, (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY | + XFS_LOCK_DEP_MASK)) == 0); ASSERT(lock_flags != 0); if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index cfe7b58c453..f75afecef8e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -382,26 +382,58 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) /* * Flags for inode locking. + * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) + * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ -#define XFS_IOLOCK_EXCL 0x001 -#define XFS_IOLOCK_SHARED 0x002 -#define XFS_ILOCK_EXCL 0x004 -#define XFS_ILOCK_SHARED 0x008 -#define XFS_IUNLOCK_NONOTIFY 0x010 -/* XFS_IOLOCK_NESTED 0x020 */ -#define XFS_EXTENT_TOKEN_RD 0x040 -#define XFS_SIZE_TOKEN_RD 0x080 +#define XFS_IOLOCK_EXCL (1<<0) +#define XFS_IOLOCK_SHARED (1<<1) +#define XFS_ILOCK_EXCL (1<<2) +#define XFS_ILOCK_SHARED (1<<3) +#define XFS_IUNLOCK_NONOTIFY (1<<4) +/* #define XFS_IOLOCK_NESTED (1<<5) */ +#define XFS_EXTENT_TOKEN_RD (1<<6) +#define XFS_SIZE_TOKEN_RD (1<<7) #define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD) -#define XFS_WILLLEND 0x100 /* Always acquire tokens for lending */ +#define XFS_WILLLEND (1<<8) /* Always acquire tokens for lending */ #define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND) #define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND) #define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND) -/* XFS_SIZE_TOKEN_WANT 0x200 */ +/* TODO:XFS_SIZE_TOKEN_WANT (1<<9) */ -#define XFS_LOCK_MASK \ - (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \ - XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \ - XFS_WILLLEND) +#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ + | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD \ + | XFS_WILLLEND) + +/* + * Flags for lockdep annotations. + * + * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes + * (ie directory operations that require locking a directory inode and + * an entry inode). The first inode gets locked with this flag so it + * gets a lockdep subclass of 1 and the second lock will have a lockdep + * subclass of 0. + * + * XFS_I[O]LOCK_INUMORDER - for locking several inodes at the some time + * with xfs_lock_inodes(). This flag is used as the starting subclass + * and each subsequent lock acquired will increment the subclass by one. + * So the first lock acquired will have a lockdep subclass of 2, the + * second lock will have a lockdep subclass of 3, and so on. + */ +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT (1 << XFS_IOLOCK_SHIFT) +#define XFS_IOLOCK_INUMORDER (2 << XFS_IOLOCK_SHIFT) + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT (1 << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_INUMORDER (2 << XFS_ILOCK_SHIFT) + +#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) + +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) /* * Flags for xfs_iflush() diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 2e830e7f52e..65c561201cb 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -696,7 +696,7 @@ xfs_unmount_flush( bhv_vnode_t *rvp = XFS_ITOV(rip); int error; - xfs_ilock(rip, XFS_ILOCK_EXCL); + xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); xfs_iflock(rip); /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e2c9bbc2782..de17aed578f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1947,7 +1947,7 @@ xfs_create( goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); XFS_BMAP_INIT(&free_list, &first_block); @@ -2141,7 +2141,7 @@ xfs_lock_dir_and_entry( attempts = 0; again: - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); e_inum = ip->i_ino; @@ -2209,6 +2209,21 @@ int xfs_lots_retries; int xfs_lock_delays; #endif +/* + * Bump the subclass so xfs_lock_inodes() acquires each lock with + * a different value + */ +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) +{ + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + lock_mode |= (subclass + XFS_IOLOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_ILOCK_INUMORDER) << XFS_ILOCK_SHIFT; + + return lock_mode; +} + /* * The following routine will lock n inodes in exclusive mode. * We assume the caller calls us with the inodes in i_ino order. @@ -2276,7 +2291,7 @@ again: * that is in the AIL. */ ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], lock_mode)) { + if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { attempts++; /* @@ -2311,7 +2326,7 @@ again: goto again; } } else { - xfs_ilock(ips[i], lock_mode); + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } } @@ -2845,7 +2860,7 @@ xfs_mkdir( goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); /* * Check for directory link count overflow. @@ -3399,7 +3414,7 @@ xfs_symlink( goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); /* * Check whether the directory allows new symlinks or not. -- cgit v1.2.3-70-g09d2