diff options
Diffstat (limited to 'fs/xfs')
30 files changed, 935 insertions, 608 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0dce969d6ca..faca4499709 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ kmem.o \ xfs_aops.o \ xfs_buf.o \ + xfs_discard.o \ xfs_export.o \ xfs_file.o \ xfs_fs_subr.o \ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 92f1f2acc6a..ac1c7e8378d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -896,7 +896,6 @@ xfs_buf_rele( trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { - ASSERT(!bp->b_relse); ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) @@ -908,11 +907,7 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (bp->b_relse) { - atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); - bp->b_relse(bp); - } else if (!(bp->b_flags & XBF_STALE) && + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { xfs_buf_lru_add(bp); spin_unlock(&pag->pag_buf_lock); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a76c2428faf..cbe65950e52 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -152,8 +152,6 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); -typedef void (*xfs_buf_relse_t)(struct xfs_buf *); -typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 @@ -183,7 +181,6 @@ typedef struct xfs_buf { void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ - xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 
#define XFS_BUF_SET_START(bp) do { } while (0) -#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) @@ -360,8 +356,7 @@ xfs_buf_set_ref( static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->b_relse) - xfs_buf_unlock(bp); + xfs_buf_unlock(bp); xfs_buf_rele(bp); } diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 00000000000..05201ae719e --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + + /* + * Too small? Give up. 
+ */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. 
In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 00000000000..e82b6dd3e12 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,8 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ba8ad422a16..a55c1b46b21 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,10 +37,45 @@ #include "xfs_trace.h" #include <linux/dcache.h> +#include <linux/falloc.h> static const struct vm_operations_struct xfs_file_vm_ops; /* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. 
+ */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* * xfs_iozero * * xfs_iozero clears the specified range of buffer supplied, @@ -262,22 +297,21 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (iocb->ki_pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } } - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } else + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -285,7 +319,7 @@ xfs_file_aio_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -309,7 +343,7 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_splice_read(ip, count, *ppos, ioflags); @@ -317,10 +351,61 @@ xfs_file_splice_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + 
ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occurred. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * could cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. 
+ */ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -331,7 +416,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize, new_size; + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -355,27 +440,9 @@ xfs_file_splice_write( trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -562,247 +629,314 @@ out_lock: return error; } +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. 
+ */ STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0, error = 0; - int ioflags = 0; - xfs_fsize_t isize, new_size; - int iolock; - size_t ocount = 0, count; - int need_i_mutex; + xfs_fsize_t new_size; + int error = 0; - XFS_STATS_INC(xs_write_calls); + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } - BUG_ON(iocb->ki_pos != pos); + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. + */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); - error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; - count = ocount; - if (count == 0) - return 0; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. 
+ */ + return file_remove_suid(file); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; +} -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. 
+ */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + size_t count = ocount; + int unaligned_io = 0; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + if (mapping->nrpages) { + WARN_ON(*iolock != XFS_IOLOCK_EXCL); + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; } - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} - if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + size_t count = ocount; - new_size = pos + count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. 
+ * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* */ - - if (pos > ip->i_size) { - error = xfs_zero_eof(ip, pos, ip->i_size); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + current->backing_dev_info = NULL; + return ret; +} - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - error = -file_remove_suid(file); - if (unlikely(error)) - goto out_unlock_internal; +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + XFS_STATS_INC(xs_write_calls); - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(ip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } + BUG_ON(iocb->ki_pos != pos); - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } + if (ocount == 0) + return 0; - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, 
count, ocount); + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; - pos += ret; - count -= ret; + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(ip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); -write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } + if (ret <= 0) + goto out_unlock; - current->backing_dev_info = NULL; + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error, error2; - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) - iocb->ki_pos = isize; + xfs_rw_iunlock(ip, iolock); + error = filemap_write_and_wait_range(mapping, pos, end); + xfs_rw_ilock(ip, iolock); - if (iocb->ki_pos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (iocb->ki_pos > ip->i_size) - ip->i_size = iocb->ki_pos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error2 = -xfs_file_fsync(file, + (file->f_flags & __O_SYNC) ? 
0 : 1); + if (error) + ret = error; + else if (error2) + ret = error2; } - error = -ret; - if (ret <= 0) - goto out_unlock_internal; +out_unlock: + xfs_aio_write_newsize_update(ip); + xfs_rw_iunlock(ip, iolock); + return ret; +} - XFS_STATS_ADD(xs_write_bytes, ret); +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error2; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); + xfs_ilock(ip, XFS_IOLOCK_EXCL); - error2 = -xfs_file_fsync(file, - (file->f_flags & __O_SYNC) ? 0 : 1); - if (!error) - error = error2; + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; } - out_unlock_internal: - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. 
- */ - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } - xfs_iunlock(ip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; } + STATIC int xfs_file_open( struct inode *inode, @@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index ad442d9e392..f5e2a19e0f8 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" #include "xfs_vnodeops.h" +#include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" @@ -984,10 +985,22 @@ xfs_ioctl_setattr( /* * Extent size must be a multiple of the appropriate block - * size, if set at all. + * size, if set at all. It must also be smaller than the + * maximum extent size supported by the filesystem. + * + * Also, for non-realtime files, limit the extent size hint to + * half the size of the AGs in the filesystem so alignment + * doesn't result in extents larger than an AG. 
*/ if (fa->fsx_extsize != 0) { - xfs_extlen_t size; + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) { + code = XFS_ERROR(EINVAL); + goto error_return; + } if (XFS_IS_REALTIME_INODE(ip) || ((mask & FSX_XFLAGS) && @@ -996,6 +1009,10 @@ xfs_ioctl_setattr( mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { + code = XFS_ERROR(EINVAL); + goto error_return; + } } if (fa->fsx_extsize % size) { @@ -1294,6 +1311,8 @@ xfs_file_ioctl( trace_xfs_file_ioctl(ip); switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index da54403633b..bd5727852fd 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/falloc.h> #include <linux/fiemap.h> #include <linux/slab.h> @@ -505,61 +504,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); - if 
(error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -653,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bd07f733936..9731898083a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1414,7 +1414,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp, SYNC_WAIT); + return -xfs_fs_log_dummy(mp); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a02480de975..e22f0057d21 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -362,7 +362,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -503,13 +503,14 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ - error = xfs_qm_sync(mp, SYNC_TRYLOCK); if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp, 0); + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7bb5092d6ae..ee3cee097e7 
100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -18,6 +18,7 @@ #include "xfs.h" #include <linux/sysctl.h> #include <linux/proc_fs.h> +#include "xfs_error.h" static struct ctl_table_header *xfs_table_header; @@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler( return ret; } + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} #endif /* CONFIG_PROC_FS */ static ctl_table xfs_table[] = { @@ -77,7 +98,7 @@ static ctl_table xfs_table[] = { .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_panic_mask_proc_handler, .extra1 = &xfs_params.panic_mask.min, .extra2 = &xfs_params.panic_mask.max }, diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 647af2a2e7a..2d0bcb47907 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + 
__entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index f8e854b4fde..206a2815ced 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void) xfs_dquot_t *dqpout; xfs_dquot_t *dqp; int restarts; + int startagain; restarts = 0; dqpout = NULL; /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ -startagain: +again: + startagain = 0; mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { @@ -1885,13 +1887,10 @@ startagain: ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); trace_xfs_dqreclaim_want(dqp); - - xfs_dqunlock(dqp); - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; + restarts++; + startagain = 1; + goto dqunlock; } /* @@ -1906,23 +1905,20 @@ startagain: ASSERT(list_empty(&dqp->q_mplist)); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); dqpout = dqp; XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; + goto dqunlock; } ASSERT(dqp->q_hash); ASSERT(!list_empty(&dqp->q_mplist)); /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. + * Try to grab the flush lock. If this dquot is in the process + * of getting flushed to disk, we don't want to reclaim it. 
*/ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } + if (!xfs_dqflock_nowait(dqp)) + goto dqunlock; /* * We have the flush lock so we know that this is not in the @@ -1944,8 +1940,7 @@ startagain: xfs_fs_cmn_err(CE_WARN, mp, "xfs_qm_dqreclaim: dquot %p flush failed", dqp); } - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; + goto dqunlock; } /* @@ -1967,13 +1962,8 @@ startagain: */ if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { restarts++; - mutex_unlock(&dqp->q_hash->qh_lock); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - goto startagain; + startagain = 1; + goto qhunlock; } ASSERT(dqp->q_nrefs == 0); @@ -1986,14 +1976,20 @@ startagain: xfs_Gqm->qm_dqfrlist_cnt--; dqpout = dqp; mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); +qhunlock: mutex_unlock(&dqp->q_hash->qh_lock); dqfunlock: xfs_dqfunlock(dqp); +dqunlock: xfs_dqunlock(dqp); if (dqpout) break; if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; + break; + if (startagain) { + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + goto again; + } } mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); return dqpout; diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 975aa10e1a4..0df88897ef8 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -25,86 +25,78 @@ #include "xfs_mount.h" #include "xfs_error.h" -static char message[1024]; /* keep it off the stack */ -static DEFINE_SPINLOCK(xfs_err_lock); - -/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ -#define XFS_MAX_ERR_LEVEL 7 -#define XFS_ERR_MASK ((1 << 3) - 1) -static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = - {KERN_EMERG, KERN_ALERT, KERN_CRIT, - KERN_ERR, KERN_WARNING, KERN_NOTICE, - KERN_INFO, KERN_DEBUG}; - void -cmn_err(register int level, char *fmt, ...) +cmn_err( + const char *lvl, + const char *fmt, + ...) 
{ - char *fp = fmt; - int len; - ulong flags; - va_list ap; - - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - va_start(ap, fmt); - if (*fmt == '!') fp++; - len = vsnprintf(message, sizeof(message), fp, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; - printk("%s%s\n", err_level[level], message); - va_end(ap); - spin_unlock_irqrestore(&xfs_err_lock,flags); - BUG_ON(level == CE_PANIC); + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%pV", lvl, &vaf); + va_end(args); + + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); } void -xfs_fs_vcmn_err( - int level, +xfs_fs_cmn_err( + const char *lvl, struct xfs_mount *mp, - char *fmt, - va_list ap) + const char *fmt, + ...) { - unsigned long flags; - int len = 0; + struct va_format vaf; + va_list args; - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - spin_lock_irqsave(&xfs_err_lock,flags); + printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf); + va_end(args); - if (mp) { - len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); +} + +/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */ +void +xfs_cmn_err( + int panic_tag, + const char *lvl, + struct xfs_mount *mp, + const char *fmt, + ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; - /* - * Skip the printk if we can't print anything useful - * due to an over-long device name. 
- */ - if (len >= sizeof(message)) - goto out; + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + printk(KERN_ALERT "XFS: Transforming an alert into a BUG."); + do_panic = 1; } - len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - printk("%s%s\n", err_level[level], message); - out: - spin_unlock_irqrestore(&xfs_err_lock,flags); + printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf); + va_end(args); - BUG_ON(level == CE_PANIC); + BUG_ON(do_panic); } void assfail(char *expr, char *file, int line) { - printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); + printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr, + file, line); BUG(); } diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index d2d20462fd4..05699f67d47 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -20,15 +20,22 @@ #include <stdarg.h> -#define CE_DEBUG 7 /* debug */ -#define CE_CONT 6 /* continuation */ -#define CE_NOTE 5 /* notice */ -#define CE_WARN 4 /* warning */ -#define CE_ALERT 1 /* alert */ -#define CE_PANIC 0 /* panic */ - -extern void cmn_err(int, char *, ...) - __attribute__ ((format (printf, 2, 3))); +struct xfs_mount; + +#define CE_DEBUG KERN_DEBUG +#define CE_CONT KERN_INFO +#define CE_NOTE KERN_NOTICE +#define CE_WARN KERN_WARNING +#define CE_ALERT KERN_ALERT +#define CE_PANIC KERN_EMERG + +void cmn_err(const char *lvl, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) 
__attribute__ ((format (printf, 4, 5))); + extern void assfail(char *expr, char *f, int l); #define ASSERT_ALWAYS(expr) \ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index fa8723f5870..f3227984a9b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,10 +41,6 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -static int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - /* * Prototypes for per-ag allocation routines */ @@ -94,7 +90,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -127,7 +123,7 @@ xfs_alloc_update( /* * Get the data from the pointed-to record. */ -STATIC int /* error */ +int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t *bno, /* output: starting block of extent */ @@ -2615,7 +2611,7 @@ restart: * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. */ -static int +int xfs_alloc_busy_search( struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 895009a9727..d0b3bc72005 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -19,6 +19,7 @@ #define __XFS_ALLOC_H__ struct xfs_buf; +struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; @@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) /* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. 
These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + +/* * Argument structure for xfs_alloc routines. * This is turned into a structure to avoid having 20 arguments passed * down several levels of the stack. @@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); #ifdef __KERNEL__ - void -xfs_alloc_busy_insert(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); void xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); #endif /* __KERNEL__ */ /* @@ -205,4 +222,18 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 4111cd3966c..dc3afd7739f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1038,17 +1038,34 @@ 
xfs_bmap_add_extent_delay_real( * Filling in the middle part of a previous delayed allocation. * Contiguity is impossible here. * This case is avoided almost all the time. + * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 */ temp = new->br_startoff - PREV.br_startoff; - trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - r[0] = *new; - r[1].br_state = PREV.br_state; - r[1].br_startblock = 0; - r[1].br_startoff = new_endoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - r[1].br_blockcount = temp2; - xfs_iext_insert(ip, idx + 1, 2, &r[0], state); + trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) @@ -2430,7 +2447,7 @@ xfs_bmap_btalloc_nullfb( startag = ag = 0; pag = xfs_perag_get(mp, ag); - while (*blen < ap->alen) { + while (*blen < args->maxlen) { if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, args->tp, ag, XFS_ALLOC_FLAG_TRYLOCK); @@ -2452,7 +2469,7 @@ xfs_bmap_btalloc_nullfb( notinit = 1; if (xfs_inode_is_filestream(ap->ip)) { - if (*blen >= ap->alen) + if (*blen >= args->maxlen) break; if (ap->userdata) { @@ -2498,14 +2515,14 @@ xfs_bmap_btalloc_nullfb( * If the best seen length is less than the request * length, use the best as the minimum. 
*/ - else if (*blen < ap->alen) + else if (*blen < args->maxlen) args->minlen = *blen; /* - * Otherwise we've seen an extent as big as alen, + * Otherwise we've seen an extent as big as maxlen, * use that as the minimum. */ else - args->minlen = ap->alen; + args->minlen = args->maxlen; /* * set the failure fallback case to look in the selected @@ -2573,7 +2590,9 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->rval; - args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); + + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); args.firstblock = ap->firstblock; blen = 0; if (nullfb) { @@ -2621,7 +2640,7 @@ xfs_bmap_btalloc( /* * Adjust for alignment */ - if (blen > args.alignment && blen <= ap->alen) + if (blen > args.alignment && blen <= args.maxlen) args.minlen = blen - args.alignment; args.minalignslop = 0; } else { @@ -2640,7 +2659,7 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= ap->alen) + if (blen > mp->m_dalign && blen <= args.maxlen) nextminlen = blen - mp->m_dalign; else nextminlen = args.minlen; @@ -4485,6 +4504,16 @@ xfs_bmapi( /* Figure out the extent size, adjust alen */ extsz = xfs_get_extsz_hint(ip); if (extsz) { + /* + * make sure we don't exceed a single + * extent length when we align the + * extent by reducing length we are + * going to allocate by the maximum + * amount extent size aligment may + * require. 
+ */ + alen = XFS_FILBLKS_MIN(len, + MAXEXTLEN - (2 * extsz - 1)); error = xfs_bmap_extsize_align(mp, &got, &prev, extsz, rt, eof, diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ed2b65f3f8b..6f8c21ce0d6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -141,7 +141,6 @@ xfs_buf_item_log_check( #define xfs_buf_item_log_check(x) #endif -STATIC void xfs_buf_error_relse(xfs_buf_t *bp); STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* @@ -428,13 +427,15 @@ xfs_buf_item_unpin( if (remove) { /* - * We have to remove the log item from the transaction - * as we are about to release our reference to the - * buffer. If we don't, the unlock that occurs later - * in xfs_trans_uncommit() will ry to reference the + * If we are in a transaction context, we have to + * remove the log item from the transaction as we are + * about to release our reference to the buffer. If we + * don't, the unlock that occurs later in + * xfs_trans_uncommit() will try to reference the * buffer which we no longer have a hold on. */ - xfs_trans_del_item(lip); + if (lip->li_desc) + xfs_trans_del_item(lip); /* * Since the transaction no longer refers to the buffer, @@ -959,128 +960,76 @@ xfs_buf_do_callbacks( */ void xfs_buf_iodone_callbacks( - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_log_item_t *lip; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - xfs_mount_t *mp; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; - if (XFS_BUF_GETERROR(bp) != 0) { - /* - * If we've already decided to shutdown the filesystem - * because of IO errors, there's no point in giving this - * a retry. 
- */ - mp = lip->li_mountp; - if (XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - XFS_BUF_SUPER_STALE(bp); - trace_xfs_buf_item_iodone(bp, _RET_IP_); - xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); - return; - } + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } - if ((XFS_BUF_TARGET(bp) != lasttarg) || - (time_after(jiffies, (lasttime + 5*HZ)))) { - lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); - } - lasttarg = XFS_BUF_TARGET(bp); + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + cmn_err(CE_ALERT, "Device %s, XFS metadata write error" + " block 0x%llx in %s", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + } + lasttarg = XFS_BUF_TARGET(bp); - if (XFS_BUF_ISASYNC(bp)) { - /* - * If the write was asynchronous then noone will be - * looking for the error. Clear the error state - * and write the buffer out again delayed write. - * - * XXXsup This is OK, so long as we catch these - * before we start the umount; we don't want these - * DELWRI metadata bufs to be hanging around. - */ - XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ - - if (!(XFS_BUF_ISSTALE(bp))) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); - } else { - /* - * If the write of the buffer was not asynchronous, - * then we want to make sure to return the error - * to the caller of bwrite(). 
Because of this we - * cannot clear the B_ERROR state at this point. - * Instead we install a callback function that - * will be called when the buffer is released, and - * that routine will clear the error state and - * set the buffer to be written out again after - * some delay. - */ - /* We actually overwrite the existing b-relse - function at times, but we're gonna be shutting down - anyway. */ - XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); + /* + * If the write was asynchronous then noone will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronous, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_FINISH_IOWAIT(bp); + XFS_BUF_SET_START(bp); } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); return; } - xfs_buf_do_callbacks(bp); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); -} - -/* - * This is a callback routine attached to a buffer which gets an error - * when being written out synchronously. - */ -STATIC void -xfs_buf_error_relse( - xfs_buf_t *bp) -{ - xfs_log_item_t *lip; - xfs_mount_t *mp; - - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - mp = (xfs_mount_t *)lip->li_mountp; - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_ERROR(bp,0); trace_xfs_buf_error_relse(bp, _RET_IP_); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - if (! 
XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - /* - * We have to unpin the pinned buffers so do the - * callbacks. - */ +do_callbacks: xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_SET_BRELSE_FUNC(bp,NULL); - xfs_buf_relse(bp); + xfs_buf_ioend(bp, 0); } - /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c78cc6a3d87..4c7db74a05f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) } #endif /* DEBUG */ - -void -xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - -void -xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - -#ifdef DEBUG - xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); -#endif - - if (xfs_panic_mask && (xfs_panic_mask & panic_tag) - && (level & CE_ALERT)) { - level &= ~CE_ALERT; - level |= CE_PANIC; - cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); - } - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - void xfs_error_report( const char *tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index f338847f80b..10dce5475f0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define 
xfs_errortag_add(tag, mp) (ENOSYS) @@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); struct xfs_mount; -extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, - char *fmt, va_list ap) - __attribute__ ((format (printf, 3, 0))); -extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...) - __attribute__ ((format (printf, 4, 5))); -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); - extern void xfs_hex_dump(void *p, int length); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) + do { \ + if (!(f & XFS_MFSI_QUIET)) \ + cmn_err(CE_WARN, "XFS: " fmt, ## args); \ + } while (0) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 75f2ef60e57..d22e6262343 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -138,7 +138,8 @@ xfs_efi_item_unpin( if (remove) { ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); - xfs_trans_del_item(lip); + if (lip->li_desc) + xfs_trans_del_item(lip); xfs_efi_item_free(efip); return; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f56d30e8040..cec89dd5d7d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -612,12 +612,13 @@ out: * * We cannot use an inode here for this - that will push dirty state back up * into the VFS and then periodic inode flushing will prevent log covering from - * making progress. Hence we log a field in the superblock instead. + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. 
*/ int xfs_fs_log_dummy( - xfs_mount_t *mp, - int flags) + xfs_mount_t *mp) { xfs_trans_t *tp; int error; @@ -632,8 +633,7 @@ xfs_fs_log_dummy( /* log the UUID because it is an unchanging field */ xfs_mod_sb(tp, XFS_SB_UUID); - if (flags & SYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index a786c5212c1..1b6a98b6688 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 55582bd6665..8a0f044750c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -337,7 +337,12 @@ xfs_iomap_prealloc_size( int shift = 0; int64_t freesp; - alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); + /* + * rounddown_pow_of_two() returns an undefined result + * if we pass in alloc_blocks = 0. Hence the "+ 1" to + * ensure we always pass in a non-zero value. + */ + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0bf24b11d0c..ae6fef1ff56 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -377,7 +377,7 @@ xfs_log_mount( cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); else { cmn_err(CE_NOTE, - "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", + "Mounting filesystem \"%s\" in no-recovery mode. 
Filesystem will be inconsistent.", mp->m_fsname); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 916eb7db14d..3bd3291ef8d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); -int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9dc8125d04e..9ca59be0897 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -543,7 +543,7 @@ xlog_cil_push( error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); if (error) - goto out_abort; + goto out_abort_free_ticket; /* * now that we've written the checkpoint into the log, strictly @@ -569,8 +569,9 @@ restart: } spin_unlock(&cil->xc_cil_lock); + /* xfs_log_done always frees the ticket on error. */ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); - if (error || commit_lsn == -1) + if (commit_lsn == -1) goto out_abort; /* attach all the transactions w/ busy extents to iclog */ @@ -600,6 +601,8 @@ out_free_ticket: kmem_free(new_ctx); return 0; +out_abort_free_ticket: + xfs_log_ticket_put(tic); out_abort: xlog_cil_committed(ctx, XFS_LI_ABORTED); return XFS_ERROR(EIO); @@ -622,7 +625,7 @@ out_abort: * background commit, returns without it held once background commits are * allowed again. 
*/ -int +void xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, @@ -637,11 +640,6 @@ xfs_log_commit_cil( if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; - if (XLOG_FORCED_SHUTDOWN(log)) { - xlog_cil_free_logvec(log_vector); - return XFS_ERROR(EIO); - } - /* * do all the hard work of formatting items (including memory * allocation) outside the CIL context lock. This prevents stalling CIL @@ -701,7 +699,6 @@ xfs_log_commit_cil( */ if (push) xlog_cil_push(log, 0); - return 0; } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 204d8e5fa7f..aa0ebb77690 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3800,7 +3800,7 @@ xlog_recover_finish( log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", + "Ending clean XFS mount for filesystem: %s\n", log->l_mp->m_fsname); } return 0; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f80a067a465..76922793f64 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1137,7 +1137,7 @@ out_undo_fdblocks: if (blkdelta) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); out: - ASSERT(error = 0); + ASSERT(error == 0); return; } @@ -1446,6 +1446,14 @@ xfs_log_item_batch_insert( * Bulk operation version of xfs_trans_committed that takes a log vector of * items to insert into the AIL. This uses bulk AIL insertion techniques to * minimise lock traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which + * means that checkpoint commit abort handling is treated exactly the same + * as an iclog write error even though we haven't started any IO yet. Hence in + * this case all we need to do is IOP_COMMITTED processing, followed by an + * IOP_UNPIN(aborted) call. 
*/ void xfs_trans_committed_bulk( @@ -1472,6 +1480,16 @@ xfs_trans_committed_bulk( if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) continue; + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + IOP_UNPIN(lip, 1); + continue; + } + if (item_lsn != commit_lsn) { /* @@ -1503,20 +1521,24 @@ xfs_trans_committed_bulk( } /* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. + * Called from the trans_commit code when we notice that the filesystem is in + * the middle of a forced shutdown. + * + * When we are called here, we have already pinned all the items in the + * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called + * so we can simply walk the items in the transaction, unpin them with an abort + * flag and then free the items. Note that unpinning the items can result in + * them being freed immediately, so we need to use a safe list traversal method + * here. */ STATIC void xfs_trans_uncommit( struct xfs_trans *tp, uint flags) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item_desc *lidp, *n; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* - * Unpin all but those that aren't dirty. - */ + list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { if (lidp->lid_flags & XFS_LID_DIRTY) IOP_UNPIN(lidp->lid_item, 1); } @@ -1733,7 +1755,6 @@ xfs_trans_commit_cil( int flags) { struct xfs_log_vec *log_vector; - int error; /* * Get each log item to allocate a vector structure for @@ -1744,9 +1765,7 @@ xfs_trans_commit_cil( if (!log_vector) return ENOMEM; - error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); - if (error) - return error; + xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); |