From a363f0c2030cb9781e7e458f4a9e354b6c43d7ce Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 11 Jan 2011 10:13:53 +1100 Subject: xfs: ensure sync write errors are returned xfs_file_aio_write() only returns the error from synchronous flushing of the data and inode if error == 0. At the point where error is being checked, it is guaranteed to be > 0. Therefore any errors returned by the data or fsync flush will never be returned. Fix the checks so we overwrite the current error once and only if an error really occurred. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 49 +++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ba8ad422a16..10b7fb4807a 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -574,7 +574,7 @@ xfs_file_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0, error = 0; + ssize_t ret = 0; int ioflags = 0; xfs_fsize_t isize, new_size; int iolock; @@ -590,9 +590,9 @@ xfs_file_aio_write( if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); - if (error) - return error; + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; count = ocount; if (count == 0) @@ -616,9 +616,9 @@ relock: xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); start: - error = -generic_write_checks(file, &pos, &count, + ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (error) { + if (ret) { xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } @@ -660,8 +660,8 @@ start: */ if (pos > ip->i_size) { - error = xfs_zero_eof(ip, pos, ip->i_size); - if (error) { + ret = -xfs_zero_eof(ip, pos, ip->i_size); + if (ret) { xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out_unlock_internal; } @@ -674,8 +674,8 @@ start: * by root. This keeps people from modifying setuid and * setgid binaries. */ - error = -file_remove_suid(file); - if (unlikely(error)) + ret = file_remove_suid(file); + if (unlikely(ret)) goto out_unlock_internal; /* We can write back this queue in page reclaim */ @@ -684,10 +684,10 @@ start: if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(ip, + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); - if (error) + if (ret) goto out_unlock_internal; } @@ -720,24 +720,22 @@ start: } } else { int enospc = 0; - ssize_t ret2 = 0; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, + ret = generic_file_buffered_write(iocb, iovp, nr_segs, pos, &iocb->ki_pos, count, ret); /* * if we just got an ENOSPC, flush the inode now we * aren't holding any page locks and retry *once* */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (error) + if (ret == -ENOSPC && !enospc) { + ret = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) goto out_unlock_internal; enospc = 1; goto write_retry; } - ret = ret2; } current->backing_dev_info = NULL; @@ -753,7 +751,6 @@ write_retry: xfs_iunlock(ip, XFS_ILOCK_EXCL); } - error = -ret; if (ret <= 0) goto out_unlock_internal; @@ -762,23 +759,23 @@ write_retry: /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; - int error2; + int error, error2; xfs_iunlock(ip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; + error = filemap_write_and_wait_range(mapping, pos, end); if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); error2 = -xfs_file_fsync(file, (file->f_flags & __O_SYNC) ? 0 : 1); - if (!error) - error = error2; + if (error) + ret = error; + else if (error2) + ret = error2; } out_unlock_internal: @@ -800,7 +797,7 @@ write_retry: out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); - return -error; + return ret; } STATIC int -- cgit v1.2.3-70-g09d2 From edafb6da9aa725e4de5fe758fe81644b6167f9a2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 11 Jan 2011 10:14:06 +1100 Subject: xfs: factor common post-write isize handling code Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 54 +++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 26 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 10b7fb4807a..b3915bf2577 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -321,6 +321,30 @@ xfs_file_splice_read( return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } +} + STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -331,7 +355,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize, new_size; + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -355,19 +379,8 @@ xfs_file_splice_write( trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, ppos, ret); if (ip->i_new_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -576,7 +589,7 @@ xfs_file_aio_write( struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; int ioflags = 0; - xfs_fsize_t isize, new_size; + xfs_fsize_t new_size; int iolock; size_t ocount = 0, count; int need_i_mutex; @@ -740,22 +753,11 @@ write_retry: current->backing_dev_info = NULL; - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) - iocb->ki_pos = isize; - - if (iocb->ki_pos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (iocb->ki_pos > ip->i_size) - ip->i_size = iocb->ki_pos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); if (ret <= 0) goto out_unlock_internal; - XFS_STATS_ADD(xs_write_bytes, ret); - /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; -- cgit v1.2.3-70-g09d2 From 4c5cfd1b4157fb75d43b44a147c2feba6422fc4f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 11 Jan 2011 10:14:16 +1100 Subject: xfs: factor post-write newsize updates Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index b3915bf2577..c47d7dc0a30 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -345,6 +345,25 @@ xfs_aio_write_isize_update( } } +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occured. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } +} + STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -381,14 +400,7 @@ xfs_file_splice_write( ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); xfs_aio_write_isize_update(inode, ppos, ret); - - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_newsize_update(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -781,20 +793,7 @@ write_retry: } out_unlock_internal: - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. - */ - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_newsize_update(ip); xfs_iunlock(ip, iolock); out_unlock_mutex: if (need_i_mutex) -- cgit v1.2.3-70-g09d2 From 487f84f3f80bc6f00c59725e822653d3ec174b85 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 12 Jan 2011 11:37:10 +1100 Subject: xfs: introduce xfs_rw_lock() helpers for locking the inode We need to obtain the i_mutex, i_iolock and i_ilock during the read and write paths. Add a set of wrapper functions to neatly encapsulate the lock ordering and shared/exclusive semantics to make the locking easier to follow and get right. Note that this changes some of the exclusive locking serialisation in that serialisation will occur against the i_mutex instead of the XFS_IOLOCK_EXCL. This does not change any behaviour, and it is arguably more efficient to use the mutex for such serialisation than the rw_sem. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 131 +++++++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 51 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index c47d7dc0a30..b5e13fbb738 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -40,6 +40,40 @@ static const struct vm_operations_struct xfs_file_vm_ops; +/* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + /* * xfs_iozero * @@ -262,22 +296,21 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (iocb->ki_pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } } - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } else + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -285,7 +318,7 @@ xfs_file_aio_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -309,7 +342,7 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_splice_read(ip, count, *ppos, ioflags); @@ -317,7 +350,7 @@ xfs_file_splice_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -338,10 +371,10 @@ xfs_aio_write_isize_update( *ppos = isize; if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); if (*ppos > ip->i_size) ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); } } @@ -356,14 +389,22 @@ xfs_aio_write_newsize_update( struct xfs_inode *ip) { if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); ip->i_new_size = 0; if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); } } +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * couuld cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. + */ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -604,7 +645,6 @@ xfs_file_aio_write( xfs_fsize_t new_size; int iolock; size_t ocount = 0, count; - int need_i_mutex; XFS_STATS_INC(xs_write_calls); @@ -631,21 +671,17 @@ xfs_file_aio_write( relock: if (ioflags & IO_ISDIRECT) { iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; } else { iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); } - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - start: + xfs_rw_ilock(ip, XFS_ILOCK_EXCL|iolock); ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (ret) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock); + return ret; } if (ioflags & IO_ISDIRECT) { @@ -654,16 +690,20 @@ start: mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock); return XFS_ERROR(-EINVAL); } - if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + /* + * For direct I/O, if there are cached pages or we're extending + * the file, we need IOLOCK_EXCL until we're sure the bytes at + * the new EOF have been zeroed and/or the cached pages are + * flushed out. Upgrade the I/O lock and start again. + */ + if (iolock != XFS_IOLOCK_EXCL && + (mapping->nrpages || pos > ip->i_size)) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); goto start; } } @@ -687,11 +727,11 @@ start: if (pos > ip->i_size) { ret = -xfs_zero_eof(ip, pos, ip->i_size); if (ret) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); goto out_unlock_internal; } } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the @@ -708,7 +748,7 @@ start: if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); + WARN_ON(iolock != XFS_IOLOCK_EXCL); ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); @@ -716,13 +756,10 @@ start: goto out_unlock_internal; } - if (need_i_mutex) { + if (iolock == XFS_IOLOCK_EXCL) { /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); - + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); @@ -740,7 +777,7 @@ start: count -= ret; ioflags &= ~IO_ISDIRECT; - xfs_iunlock(ip, iolock); + xfs_rw_iunlock(ip, iolock); goto relock; } } else { @@ -775,14 +812,9 @@ write_retry: loff_t end = pos + ret - 1; int error, error2; - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - + xfs_rw_iunlock(ip, iolock); error = filemap_write_and_wait_range(mapping, pos, end); - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); + xfs_rw_ilock(ip, iolock); error2 = -xfs_file_fsync(file, (file->f_flags & __O_SYNC) ? 0 : 1); @@ -794,10 +826,7 @@ write_retry: out_unlock_internal: xfs_aio_write_newsize_update(ip); - xfs_iunlock(ip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); + xfs_rw_iunlock(ip, iolock); return ret; } -- cgit v1.2.3-70-g09d2 From f0d26e860b6c496464c5c8165d7df08dabde01fa Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 11 Jan 2011 10:15:36 +1100 Subject: xfs: split direct IO write path from xfs_file_aio_write The current xfs_file_aio_write code is a mess of locking shenanigans to handle the different locking requirements of buffered and direct IO. Start to clean this up by disentangling the direct IO path from the mess. This also removes the failed direct IO fallback path to buffered IO. XFS handles all direct IO cases without needing to fall back to buffered IO, so we can safely remove this unused path. This greatly simplifies the logic and locking needed in the write path. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 179 ++++++++++++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 63 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index b5e13fbb738..00661fd21fc 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -628,6 +628,116 @@ out_lock: return error; } +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By spearating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + xfs_fsize_t new_size; + size_t count = ocount; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + /* + * For direct I/O, if there are cached pages or we're extending + * the file, we need IOLOCK_EXCL until we're sure the bytes at + * the new EOF have been zeroed and/or the cached pages are + * flushed out. + */ + if (mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + + ret = generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (ret) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return ret; + } + + new_size = pos + count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; + + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we have a couple of + * things to do. First, if there is already space allocated we need to + * either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need to zero it + * out up to the new size. + */ + if (pos > ip->i_size) { + ret = -xfs_zero_eof(ip, pos, ip->i_size); + if (ret) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + return ret; + } + } + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + ret = file_remove_suid(file); + if (unlikely(ret)) + return ret; + + if (mapping->nrpages) { + WARN_ON(*iolock != XFS_IOLOCK_EXCL); + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; + } + + if (*iolock == XFS_IOLOCK_EXCL) { + /* demote the lock now the cached pages are gone */ + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); + + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} + STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, @@ -670,12 +780,12 @@ xfs_file_aio_write( relock: if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - } else { - iolock = XFS_IOLOCK_EXCL; + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + goto done_io; } + iolock = XFS_IOLOCK_EXCL; -start: xfs_rw_ilock(ip, XFS_ILOCK_EXCL|iolock); ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); @@ -684,30 +794,6 @@ start: return ret; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } - - /* - * For direct I/O, if there are cached pages or we're extending - * the file, we need IOLOCK_EXCL until we're sure the bytes at - * the new EOF have been zeroed and/or the cached pages are - * flushed out. Upgrade the I/O lock and start again. - */ - if (iolock != XFS_IOLOCK_EXCL && - (mapping->nrpages || pos > ip->i_size)) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - goto start; - } - } - new_size = pos + count; if (new_size > ip->i_size) ip->i_new_size = new_size; @@ -746,41 +832,7 @@ start: /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(iolock != XFS_IOLOCK_EXCL); - ret = -xfs_flushinval_pages(ip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (ret) - goto out_unlock_internal; - } - - if (iolock == XFS_IOLOCK_EXCL) { - /* demote the lock now the cached pages are gone */ - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; - } - - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); - - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); - - pos += ret; - count -= ret; - - ioflags &= ~IO_ISDIRECT; - xfs_rw_iunlock(ip, iolock); - goto relock; - } - } else { + if (!(ioflags & IO_ISDIRECT)) { int enospc = 0; write_retry: @@ -802,6 +854,7 @@ write_retry: current->backing_dev_info = NULL; +done_io: xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); if (ret <= 0) -- cgit v1.2.3-70-g09d2 From 637bbc75d9fda57c7bc77ce5ee37e29a77a0520d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 11 Jan 2011 10:17:30 +1100 Subject: xfs: split buffered IO write path from xfs_file_aio_write Complete the split of the different write IO paths by splitting the buffered IO write path out of xfs_file_aio_write(). This makes the different mechanisms of the write patchs easier to follow. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 146 +++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 77 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 00661fd21fc..e2bcf51d292 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -739,58 +739,31 @@ xfs_file_dio_aio_write( } STATIC ssize_t -xfs_file_aio_write( +xfs_file_buffered_aio_write( struct kiocb *iocb, const struct iovec *iovp, unsigned long nr_segs, - loff_t pos) + loff_t pos, + size_t ocount, + int *iolock) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int ioflags = 0; + ssize_t ret; + int enospc = 0; xfs_fsize_t new_size; - int iolock; - size_t ocount = 0, count; - - XFS_STATS_INC(xs_write_calls); - - BUG_ON(iocb->ki_pos != pos); - - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - - ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); - if (ret) - return ret; - - count = ocount; - if (count == 0) - return 0; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; + size_t count = ocount; -relock: - if (ioflags & IO_ISDIRECT) { - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); - goto done_io; - } - iolock = XFS_IOLOCK_EXCL; + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - xfs_rw_ilock(ip, XFS_ILOCK_EXCL|iolock); ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (ret) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL|iolock); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; return ret; } @@ -798,67 +771,86 @@ relock: if (new_size > ip->i_size) ip->i_new_size = new_size; - if (likely(!(ioflags & IO_INVIS))) + if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); - /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. - */ - if (pos > ip->i_size) { ret = -xfs_zero_eof(ip, pos, ip->i_size); if (ret) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - goto out_unlock_internal; + return ret; } } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ ret = file_remove_suid(file); if (unlikely(ret)) - goto out_unlock_internal; + return ret; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - if (!(ioflags & IO_ISDIRECT)) { - int enospc = 0; - write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret == -ENOSPC && !enospc) { - ret = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (ret) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* + */ + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; } - current->backing_dev_info = NULL; + return ret; +} + +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; + + XFS_STATS_INC(xs_write_calls); + + BUG_ON(iocb->ki_pos != pos); + + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; + + if (ocount == 0) + return 0; + + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); -done_io: xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); if (ret <= 0) - goto out_unlock_internal; + goto out_unlock; /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { @@ -877,7 +869,7 @@ done_io: ret = error2; } - out_unlock_internal: +out_unlock: xfs_aio_write_newsize_update(ip); xfs_rw_iunlock(ip, iolock); return ret; -- cgit v1.2.3-70-g09d2 From 4d8d15812fd9bc96d0da11467d23e0373feae933 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 11 Jan 2011 10:23:42 +1100 Subject: xfs: factor common write setup code The buffered IO and direct IO write paths share a common set of checks and limiting code prior to issuing the write. Factor that into a common helper function. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 123 ++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 67 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e2bcf51d292..5863dd8f448 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -628,6 +628,58 @@ out_lock: return error; } +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. + */ +STATIC ssize_t +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t new_size; + int error = 0; + + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } + + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; + + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. + */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); + + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); + +} + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -653,7 +705,6 @@ xfs_file_dio_aio_write( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; - xfs_fsize_t new_size; size_t count = ocount; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -674,45 +725,8 @@ xfs_file_dio_aio_write( *iolock = XFS_IOLOCK_SHARED; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (ret) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); - *iolock = 0; - return ret; - } - - new_size = pos + count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; - - if (likely(!(file->f_mode & FMODE_NOCMTIME))) - file_update_time(file); - - /* - * If the offset is beyond the size of the file, we have a couple of - * things to do. First, if there is already space allocated we need to - * either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need to zero it - * out up to the new size. - */ - if (pos > ip->i_size) { - ret = -xfs_zero_eof(ip, pos, ip->i_size); - if (ret) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - return ret; - } - } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - - /* - * If we're writing the file then make sure to clear the setuid and - * setgid bits if the process is not being run by root. This keeps - * people from modifying setuid and setgid binaries. - */ - ret = file_remove_suid(file); - if (unlikely(ret)) + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) return ret; if (mapping->nrpages) { @@ -753,38 +767,13 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - xfs_fsize_t new_size; size_t count = ocount; *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (ret) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); - *iolock = 0; - return ret; - } - - new_size = pos + count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; - - if (likely(!(file->f_mode & FMODE_NOCMTIME))) - file_update_time(file); - - if (pos > ip->i_size) { - ret = -xfs_zero_eof(ip, pos, ip->i_size); - if (ret) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - return ret; - } - } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - - ret = file_remove_suid(file); - if (unlikely(ret)) + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) return ret; /* We can write back this queue in page reclaim */ -- cgit v1.2.3-70-g09d2 From eda77982729b7170bdc9e8855f0682edf322d277 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 11 Jan 2011 10:22:40 +1100 Subject: xfs: serialise unaligned direct IOs When two concurrent unaligned, non-overlapping direct IOs are issued to the same block, the direct Io layer will race to zero the block. The result is that one of the concurrent IOs will overwrite data written by the other IO with zeros. This is demonstrated by the xfsqa test 240. To avoid this problem, serialise all unaligned direct IOs to an inode with a big hammer. We need a big hammer approach as we need to serialise AIO as well, so we can't just block writes on locks. Hence, the big hammer is calling xfs_ioend_wait() while holding out other unaligned direct IOs from starting. We don't bother trying to serialised aligned vs unaligned IOs as they are overlapping IO and the result of concurrent overlapping IOs is undefined - the result of either IO is a valid result so we let them race. Hence we only penalise unaligned IO, which already has a major overhead compared to aligned IO so this isn't a major problem. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_file.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5863dd8f448..ef51eb43e13 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -684,9 +684,24 @@ xfs_file_aio_write_checks( * xfs_file_dio_aio_write - handle direct IO writes * * Lock the inode appropriately to prepare for and issue a direct IO write. - * By spearating it from the buffered write path we remove all the tricky to + * By separating it from the buffered write path we remove all the tricky to * follow locking changes and looping. * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. */ @@ -706,6 +721,7 @@ xfs_file_dio_aio_write( struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; size_t count = ocount; + int unaligned_io = 0; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -713,13 +729,10 @@ xfs_file_dio_aio_write( if ((pos & target->bt_smask) || (count & target->bt_smask)) return -XFS_ERROR(EINVAL); - /* - * For direct I/O, if there are cached pages or we're extending - * the file, we need IOLOCK_EXCL until we're sure the bytes at - * the new EOF have been zeroed and/or the cached pages are - * flushed out. - */ - if (mapping->nrpages || pos > ip->i_size) + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; @@ -737,8 +750,13 @@ xfs_file_dio_aio_write( return ret; } - if (*iolock == XFS_IOLOCK_EXCL) { - /* demote the lock now the cached pages are gone */ + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); *iolock = XFS_IOLOCK_SHARED; } -- cgit v1.2.3-70-g09d2 From c58efdb442bb49dea1d148f207560c41918c1bf4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 4 Jan 2011 04:49:29 +0000 Subject: xfs: ensure log covering transactions are synchronous To ensure the log is covered and the filesystem idles correctly, we need to ensure that dummy transactions hit the disk and do not stay pinned in memory. If the superblock is pinned in memory, it can't be flushed so the log covering cannot make progress. The result is dependent on timing - more oftent han not we continue to issues a log covering transaction every 36s rather than idling after ~90s. Fix this by making the log covering transaction synchronous. To avoid additional log force from xfssyncd, make the log covering transaction take the place of the existing log force in the xfssyncd background sync process. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/linux-2.6/xfs_sync.c | 11 ++++++----- fs/xfs/xfs_fsops.c | 10 +++++----- fs/xfs/xfs_fsops.h | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c51faaa5e29..af32f375ca9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1413,7 +1413,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp, SYNC_WAIT); + return -xfs_fs_log_dummy(mp); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a02480de975..e22f0057d21 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -362,7 +362,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -503,13 +503,14 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ - error = xfs_qm_sync(mp, SYNC_TRYLOCK); if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp, 0); + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f56d30e8040..cec89dd5d7d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -612,12 +612,13 @@ out: * * We cannot use an inode here for this - that will push dirty state back up * into the VFS and then periodic inode flushing will prevent log covering from - * making progress. Hence we log a field in the superblock instead. + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. */ int xfs_fs_log_dummy( - xfs_mount_t *mp, - int flags) + xfs_mount_t *mp) { xfs_trans_t *tp; int error; @@ -632,8 +633,7 @@ xfs_fs_log_dummy( /* log the UUID because it is an unchanging field */ xfs_mod_sb(tp, XFS_SB_UUID); - if (flags & SYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index a786c5212c1..1b6a98b6688 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ -- cgit v1.2.3-70-g09d2 From a46db60834883c1c8c665d7fcc7b4ab66f5966fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jan 2011 13:02:04 +0000 Subject: xfs: add FITRIM support Allow manual discards from userspace using the FITRIM ioctl. This is not intended to be run during normal workloads, as the freepsace btree walks can cause large performance degradation. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/Makefile | 1 + fs/xfs/linux-2.6/xfs_discard.c | 191 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_discard.h | 8 ++ fs/xfs/linux-2.6/xfs_ioctl.c | 3 + fs/xfs/linux-2.6/xfs_trace.h | 33 +++++++ fs/xfs/xfs_alloc.c | 10 +-- fs/xfs/xfs_alloc.h | 25 ++++-- 7 files changed, 259 insertions(+), 12 deletions(-) create mode 100644 fs/xfs/linux-2.6/xfs_discard.c create mode 100644 fs/xfs/linux-2.6/xfs_discard.h (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0dce969d6ca..faca4499709 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ kmem.o \ xfs_aops.o \ xfs_buf.o \ + xfs_discard.o \ xfs_export.o \ xfs_file.o \ xfs_fs_subr.o \ diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 00000000000..05201ae719e --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + + /* + * Too small? Give up. + */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 00000000000..e82b6dd3e12 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,8 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index ad442d9e392..b06ede1d0be 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" #include "xfs_vnodeops.h" +#include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" @@ -1294,6 +1295,8 @@ xfs_file_ioctl( trace_xfs_file_ioctl(ip); switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 647af2a2e7a..2d0bcb47907 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index fa8723f5870..f3227984a9b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,10 +41,6 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -static int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - /* * Prototypes for per-ag allocation routines */ @@ -94,7 +90,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -127,7 +123,7 @@ xfs_alloc_update( /* * Get the data from the pointed-to record. */ -STATIC int /* error */ +int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t *bno, /* output: starting block of extent */ @@ -2615,7 +2611,7 @@ restart: * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. */ -static int +int xfs_alloc_busy_search( struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 895009a9727..0ab56b32c7e 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -19,6 +19,7 @@ #define __XFS_ALLOC_H__ struct xfs_buf; +struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; @@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); #ifdef __KERNEL__ - void -xfs_alloc_busy_insert(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); void xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); #endif /* __KERNEL__ */ /* @@ -205,4 +206,18 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + #endif /* __XFS_ALLOC_H__ */ -- cgit v1.2.3-70-g09d2 From bfc60177f8ab509bc225becbb58f7e53a0e33e81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jan 2011 13:02:23 +0000 Subject: xfs: fix error handling for synchronous writes If we get an IO error on a synchronous superblock write, we attach an error release function to it so that when the last reference goes away the release function is called and the buffer is invalidated and unlocked. The buffer is left locked until the release function is called so that other concurrent users of the buffer will be locked out until the buffer error is fully processed. Unfortunately, for the superblock buffer the filesyetm itself holds a reference to the buffer which prevents the reference count from dropping to zero and the release function being called. As a result, once an IO error occurs on a sync write, the buffer will never be unlocked and all future attempts to lock the buffer will hang. To make matters worse, this problems is not unique to such buffers; if there is a concurrent _xfs_buf_find() running, the lookup will grab a reference to the buffer and then wait on the buffer lock, preventing the reference count from ever falling to zero and hence unlocking the buffer. As such, the whole b_relse function implementation is broken because it cannot rely on the buffer reference count falling to zero to unlock the errored buffer. The synchronous write error path is the only path that uses this callback - it is used to ensure that the synchronous waiter gets the buffer error before the error state is cleared from the buffer by the release function. Given that the only sychronous buffer writes now go through xfs_bwrite and the error path in question can only occur for a write of a dirty, logged buffer, we can move most of the b_relse processing to happen inline in xfs_buf_iodone_callbacks, just like a normal I/O completion. In addition to that we make sure the error is not cleared in xfs_buf_iodone_callbacks, so that xfs_bwrite can reliably check it. Given that xfs_bwrite keeps the buffer locked until it has waited for it and checked the error this allows to reliably propagate the error to the caller, and make sure that the buffer is reliably unlocked. Given that xfs_buf_iodone_callbacks was the only instance of the b_relse callback we can remove it entirely. Based on earlier patches by Dave Chinner and Ajeet Yadav. Signed-off-by: Christoph Hellwig Reported-by: Ajeet Yadav Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 7 +-- fs/xfs/linux-2.6/xfs_buf.h | 7 +-- fs/xfs/xfs_buf_item.c | 151 +++++++++++++++------------------------------ 3 files changed, 51 insertions(+), 114 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 92f1f2acc6a..ac1c7e8378d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -896,7 +896,6 @@ xfs_buf_rele( trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { - ASSERT(!bp->b_relse); ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) @@ -908,11 +907,7 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (bp->b_relse) { - atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); - bp->b_relse(bp); - } else if (!(bp->b_flags & XBF_STALE) && + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { xfs_buf_lru_add(bp); spin_unlock(&pag->pag_buf_lock); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a76c2428faf..cbe65950e52 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -152,8 +152,6 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); -typedef void (*xfs_buf_relse_t)(struct xfs_buf *); -typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 @@ -183,7 +181,6 @@ typedef struct xfs_buf { void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ - xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) #define XFS_BUF_SET_START(bp) do { } while (0) -#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) @@ -360,8 +356,7 @@ xfs_buf_set_ref( static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->b_relse) - xfs_buf_unlock(bp); + xfs_buf_unlock(bp); xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ed2b65f3f8b..98c6f73b675 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -141,7 +141,6 @@ xfs_buf_item_log_check( #define xfs_buf_item_log_check(x) #endif -STATIC void xfs_buf_error_relse(xfs_buf_t *bp); STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* @@ -959,128 +958,76 @@ xfs_buf_do_callbacks( */ void xfs_buf_iodone_callbacks( - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_log_item_t *lip; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - xfs_mount_t *mp; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; - if (XFS_BUF_GETERROR(bp) != 0) { - /* - * If we've already decided to shutdown the filesystem - * because of IO errors, there's no point in giving this - * a retry. - */ - mp = lip->li_mountp; - if (XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - XFS_BUF_SUPER_STALE(bp); - trace_xfs_buf_item_iodone(bp, _RET_IP_); - xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); - return; - } + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } - if ((XFS_BUF_TARGET(bp) != lasttarg) || - (time_after(jiffies, (lasttime + 5*HZ)))) { - lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); - } - lasttarg = XFS_BUF_TARGET(bp); + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + cmn_err(CE_ALERT, "Device %s, XFS metadata write error" + " block 0x%llx in %s", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + } + lasttarg = XFS_BUF_TARGET(bp); - if (XFS_BUF_ISASYNC(bp)) { - /* - * If the write was asynchronous then noone will be - * looking for the error. Clear the error state - * and write the buffer out again delayed write. - * - * XXXsup This is OK, so long as we catch these - * before we start the umount; we don't want these - * DELWRI metadata bufs to be hanging around. - */ - XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ - - if (!(XFS_BUF_ISSTALE(bp))) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); - } else { - /* - * If the write of the buffer was not asynchronous, - * then we want to make sure to return the error - * to the caller of bwrite(). Because of this we - * cannot clear the B_ERROR state at this point. - * Instead we install a callback function that - * will be called when the buffer is released, and - * that routine will clear the error state and - * set the buffer to be written out again after - * some delay. - */ - /* We actually overwrite the existing b-relse - function at times, but we're gonna be shutting down - anyway. */ - XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); + /* + * If the write was asynchronous then noone will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronous, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_FINISH_IOWAIT(bp); + XFS_BUF_SET_START(bp); } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); return; } - xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); -} - -/* - * This is a callback routine attached to a buffer which gets an error - * when being written out synchronously. - */ -STATIC void -xfs_buf_error_relse( - xfs_buf_t *bp) -{ - xfs_log_item_t *lip; - xfs_mount_t *mp; - - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - mp = (xfs_mount_t *)lip->li_mountp; - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_ERROR(bp,0); trace_xfs_buf_error_relse(bp, _RET_IP_); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - if (! XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - /* - * We have to unpin the pinned buffers so do the - * callbacks. - */ +do_callbacks: xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_SET_BRELSE_FUNC(bp,NULL); - xfs_buf_relse(bp); + xfs_buf_ioend(bp, 0); } - /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. -- cgit v1.2.3-70-g09d2 From 73efe4a4ddf8eb2b1cc7039e8a66a23a424961af Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 12 Jan 2011 00:35:42 +0000 Subject: xfs: prevent NMI timeouts in cmn_err We currently have a global error message buffer in cmn_err that is protected by a spin lock that disables interrupts. Recently there have been reports of NMI timeouts occurring when the console is being flooded by SCSI error reports due to cmn_err() getting stuck trying to print to the console while holding this lock (i.e. with interrupts disabled). The NMI watchdog is seeing this CPU as non-responding and so is triggering a panic. While the trigger for the reported case is SCSI errors, pretty much anything that spams the kernel log could cause this to occur. Realistically the only reason that we have the intemediate message buffer is to prepend the correct kernel log level prefix to the log message. The only reason we have the lock is to protect the global message buffer and the only reason the message buffer is global is to keep it off the stack. Hence if we can avoid needing a global message buffer we avoid needing the lock, and we can do this with a small amount of cleanup and some preprocessor tricks: 1. clean up xfs_cmn_err() panic mask functionality to avoid needing debug code in xfs_cmn_err() 2. remove the couple of "!" message prefixes that still exist that the existing cmn_err() code steps over. 3. redefine CE_* levels directly to KERN_* 4. redefine cmn_err() and friends to use printk() directly via variable argument length macros. By doing this, we can completely remove the cmn_err() code and the lock that is causing the problems, and rely solely on printk() serialisation to ensure that we don't get garbled messages. A series of followup patches is really needed to clean up all the cmn_err() calls and related messages properly, but that results in a series that is not easily back portable to enterprise kernels. Hence this initial fix is only to address the direct problem in the lowest impact way possible. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sysctl.c | 23 ++++++++- fs/xfs/support/debug.c | 109 +++++++++++++++++++----------------------- fs/xfs/support/debug.h | 25 ++++++---- fs/xfs/xfs_error.c | 31 ------------ fs/xfs/xfs_error.h | 18 +++---- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 7 files changed, 96 insertions(+), 114 deletions(-) (limited to 'fs/xfs/linux-2.6') diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7bb5092d6ae..ee3cee097e7 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -18,6 +18,7 @@ #include "xfs.h" #include #include +#include "xfs_error.h" static struct ctl_table_header *xfs_table_header; @@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler( return ret; } + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} #endif /* CONFIG_PROC_FS */ static ctl_table xfs_table[] = { @@ -77,7 +98,7 @@ static ctl_table xfs_table[] = { .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_panic_mask_proc_handler, .extra1 = &xfs_params.panic_mask.min, .extra2 = &xfs_params.panic_mask.max }, diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 86162e5f9a2..e6cf955ec0f 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -25,80 +25,71 @@ #include "xfs_mount.h" #include "xfs_error.h" -static char message[1024]; /* keep it off the stack */ -static DEFINE_SPINLOCK(xfs_err_lock); - -/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ -#define XFS_MAX_ERR_LEVEL 7 -#define XFS_ERR_MASK ((1 << 3) - 1) -static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = - {KERN_EMERG, KERN_ALERT, KERN_CRIT, - KERN_ERR, KERN_WARNING, KERN_NOTICE, - KERN_INFO, KERN_DEBUG}; - void -cmn_err(register int level, char *fmt, ...) +cmn_err( + const char *lvl, + const char *fmt, + ...) { - char *fp = fmt; - int len; - ulong flags; - va_list ap; - - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - va_start(ap, fmt); - if (*fmt == '!') fp++; - len = vsnprintf(message, sizeof(message), fp, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; - printk("%s%s\n", err_level[level], message); - va_end(ap); - spin_unlock_irqrestore(&xfs_err_lock,flags); - BUG_ON(level == CE_PANIC); + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%pV", lvl, &vaf); + va_end(args); + + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); } void -xfs_fs_vcmn_err( - int level, +xfs_fs_cmn_err( + const char *lvl, struct xfs_mount *mp, - char *fmt, - va_list ap) + const char *fmt, + ...) { - unsigned long flags; - int len = 0; + struct va_format vaf; + va_list args; - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - spin_lock_irqsave(&xfs_err_lock,flags); + printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf); + va_end(args); - if (mp) { - len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); +} + +/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */ +void +xfs_cmn_err( + int panic_tag, + const char *lvl, + struct xfs_mount *mp, + const char *fmt, + ...) +{ + struct va_format vaf; + va_list args; + int panic = 0; - /* - * Skip the printk if we can't print anything useful - * due to an over-long device name. - */ - if (len >= sizeof(message)) - goto out; + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + printk(KERN_ALERT "XFS: Transforming an alert into a BUG."); + panic = 1; } - len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - printk("%s%s\n", err_level[level], message); - out: - spin_unlock_irqrestore(&xfs_err_lock,flags); + printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf); + va_end(args); - BUG_ON(level == CE_PANIC); + BUG_ON(panic); } void diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index d2d20462fd4..05699f67d47 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -20,15 +20,22 @@ #include -#define CE_DEBUG 7 /* debug */ -#define CE_CONT 6 /* continuation */ -#define CE_NOTE 5 /* notice */ -#define CE_WARN 4 /* warning */ -#define CE_ALERT 1 /* alert */ -#define CE_PANIC 0 /* panic */ - -extern void cmn_err(int, char *, ...) - __attribute__ ((format (printf, 2, 3))); +struct xfs_mount; + +#define CE_DEBUG KERN_DEBUG +#define CE_CONT KERN_INFO +#define CE_NOTE KERN_NOTICE +#define CE_WARN KERN_WARNING +#define CE_ALERT KERN_ALERT +#define CE_PANIC KERN_EMERG + +void cmn_err(const char *lvl, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 4, 5))); + extern void assfail(char *expr, char *f, int l); #define ASSERT_ALWAYS(expr) \ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c78cc6a3d87..4c7db74a05f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) } #endif /* DEBUG */ - -void -xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - -void -xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - -#ifdef DEBUG - xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); -#endif - - if (xfs_panic_mask && (xfs_panic_mask & panic_tag) - && (level & CE_ALERT)) { - level &= ~CE_ALERT; - level |= CE_PANIC; - cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); - } - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - void xfs_error_report( const char *tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index f338847f80b..10dce5475f0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) @@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); struct xfs_mount; -extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, - char *fmt, va_list ap) - __attribute__ ((format (printf, 3, 0))); -extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...) - __attribute__ ((format (printf, 4, 5))); -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); - extern void xfs_hex_dump(void *p, int length); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) + do { \ + if (!(f & XFS_MFSI_QUIET)) \ + cmn_err(CE_WARN, "XFS: " fmt, ## args); \ + } while (0) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0bf24b11d0c..ae6fef1ff56 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -377,7 +377,7 @@ xfs_log_mount( cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); else { cmn_err(CE_NOTE, - "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", + "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", mp->m_fsname); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 204d8e5fa7f..aa0ebb77690 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3800,7 +3800,7 @@ xlog_recover_finish( log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", + "Ending clean XFS mount for filesystem: %s\n", log->l_mp->m_fsname); } return 0; -- cgit v1.2.3-70-g09d2