diff options
Diffstat (limited to 'fs/xfs')
54 files changed, 2942 insertions, 2524 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0dce969d6ca..faca4499709 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ kmem.o \ xfs_aops.o \ xfs_buf.o \ + xfs_discard.o \ xfs_export.o \ xfs_file.o \ xfs_fs_subr.o \ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h deleted file mode 100644 index 4dfc7c37081..00000000000 --- a/fs/xfs/linux-2.6/sv.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SV_H__ -#define __XFS_SUPPORT_SV_H__ - -#include <linux/wait.h> -#include <linux/sched.h> -#include <linux/spinlock.h> - -/* - * Synchronisation variables. - * - * (Parameters "pri", "svf" and "rts" are not implemented) - */ - -typedef struct sv_s { - wait_queue_head_t waiters; -} sv_t; - -static inline void _sv_wait(sv_t *sv, spinlock_t *lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&sv->waiters, &wait); - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(lock); - - schedule(); - - remove_wait_queue(&sv->waiters, &wait); -} - -#define sv_init(sv,flag,name) \ - init_waitqueue_head(&(sv)->waiters) -#define sv_destroy(sv) \ - /*NOTHING*/ -#define sv_wait(sv, pri, lock, s) \ - _sv_wait(sv, lock) -#define sv_signal(sv) \ - wake_up(&(sv)->waiters) -#define sv_broadcast(sv) \ - wake_up_all(&(sv)->waiters) - -#endif /* __XFS_SUPPORT_SV_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 691f61223ed..ec7bbb5645b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -38,15 +38,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* - * Types of I/O for bmap clustering and I/O completion tracking. - */ -enum { - IO_READ, /* mapping for a read */ - IO_DELAY, /* mapping covers delalloc region */ - IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ - IO_NEW /* just allocated */ -}; /* * Prime number of hash buckets since address is used as the key. @@ -182,9 +173,6 @@ xfs_setfilesize( xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IO_READ); - if (unlikely(ioend->io_error)) return 0; @@ -244,10 +232,8 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IO_READ) { - error = xfs_setfilesize(ioend); - ASSERT(!error || error == EAGAIN); - } + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); /* * If we didn't complete processing of the ioend, requeue it to the @@ -318,14 +304,63 @@ STATIC int xfs_map_blocks( struct inode *inode, loff_t offset, - ssize_t count, struct xfs_bmbt_irec *imap, - int flags) + int type, + int nonblocking) { - int nmaps = 1; - int new = 0; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (type == IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -XFS_ERROR(EAGAIN); + xfs_ilock(ip, XFS_ILOCK_SHARED); + } - return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_maxioffset); + + if (offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + bmapi_flags, NULL, 0, imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return -XFS_ERROR(error); + + if (type == IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, count, imap); + if (!error) + trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return -XFS_ERROR(error); + } + +#ifdef DEBUG + if (type == IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; } STATIC int @@ -380,26 +415,18 @@ xfs_submit_ioend_bio( submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); - ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); - bio_put(bio); } STATIC struct bio * xfs_alloc_ioend_bio( struct buffer_head *bh) { - struct bio *bio; int nvecs = bio_get_nr_vecs(bh->b_bdev); - - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (!bio); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio_get(bio); return bio; } @@ -470,9 +497,8 @@ xfs_submit_ioend( /* Pass 1 - start writeback */ do { next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) xfs_start_buffer_writeback(bh); - } } while ((ioend = next) != NULL); /* Pass 2 - submit I/O */ @@ -600,117 +626,13 @@ xfs_map_at_offset( ASSERT(imap->br_startblock != HOLESTARTBLOCK); ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - lock_buffer(bh); xfs_map_buffer(inode, bh, imap, offset); - bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); } /* - * Look for a page at index that is suitable for clustering. - */ -STATIC unsigned int -xfs_probe_page( - struct page *page, - unsigned int pg_offset) -{ - struct buffer_head *bh, *head; - int ret = 0; - - if (PageWriteback(page)) - return 0; - if (!PageDirty(page)) - return 0; - if (!page->mapping) - return 0; - if (!page_has_buffers(page)) - return 0; - - bh = head = page_buffers(page); - do { - if (!buffer_uptodate(bh)) - break; - if (!buffer_mapped(bh)) - break; - ret += bh->b_size; - if (ret >= pg_offset) - break; - } while ((bh = bh->b_this_page) != head); - - return ret; -} - -STATIC size_t -xfs_probe_cluster( - struct inode *inode, - struct page *startpage, - struct buffer_head *bh, - struct buffer_head *head) -{ - struct pagevec pvec; - pgoff_t tindex, tlast, tloff; - size_t total = 0; - int done = 0, i; - - /* First sum forwards in this page */ - do { - if (!buffer_uptodate(bh) || !buffer_mapped(bh)) - return total; - total += bh->b_size; - } while ((bh = bh->b_this_page) != head); - - /* if we reached the end of the page, sum forwards in following pages */ - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - tindex = startpage->index + 1; - - /* Prune this back to avoid pathological behavior */ - tloff = min(tlast, startpage->index + 64); - - pagevec_init(&pvec, 0); - while (!done && tindex <= tloff) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t pg_offset, pg_len = 0; - - if (tindex == tlast) { - pg_offset = - i_size_read(inode) & (PAGE_CACHE_SIZE - 1); - if (!pg_offset) { - done = 1; - break; - } - } else - pg_offset = PAGE_CACHE_SIZE; - - if (page->index == tindex && trylock_page(page)) { - pg_len = xfs_probe_page(page, pg_offset); - unlock_page(page); - } - - if (!pg_len) { - done = 1; - break; - } - - total += pg_len; - tindex++; - } - - pagevec_release(&pvec); - cond_resched(); - } - - return total; -} - -/* * Test if a given page is suitable for writing as part of an unwritten * or delayed allocate extent. */ @@ -731,9 +653,9 @@ xfs_is_delayed_page( if (buffer_unwritten(bh)) acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELAY); + acceptable = (type == IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_NEW); + acceptable = (type == IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -758,8 +680,7 @@ xfs_convert_page( loff_t tindex, struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, - struct writeback_control *wbc, - int all_bh) + struct writeback_control *wbc) { struct buffer_head *bh, *head; xfs_off_t end_offset; @@ -814,37 +735,30 @@ xfs_convert_page( continue; } - if (buffer_unwritten(bh) || buffer_delay(bh)) { + if (buffer_unwritten(bh) || buffer_delay(bh) || + buffer_mapped(bh)) { if (buffer_unwritten(bh)) type = IO_UNWRITTEN; + else if (buffer_delay(bh)) + type = IO_DELALLOC; else - type = IO_DELAY; + type = IO_OVERWRITE; if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(imap->br_startblock != HOLESTARTBLOCK); - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - - xfs_map_at_offset(inode, bh, imap, offset); + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, imap, offset); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); page_dirty--; count++; } else { - type = IO_NEW; - if (buffer_mapped(bh) && all_bh) { - lock_buffer(bh); - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - count++; - page_dirty--; - } else { - done = 1; - } + done = 1; } } while (offset += len, (bh = bh->b_this_page) != head); @@ -876,7 +790,6 @@ xfs_cluster_write( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int all_bh, pgoff_t tlast) { struct pagevec pvec; @@ -891,7 +804,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc, all_bh); + imap, ioendp, wbc); if (done) break; } @@ -935,7 +848,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELAY)) + if (!xfs_is_delayed_page(page, IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1002,10 +915,10 @@ xfs_vm_writepage( unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; - ssize_t size, len; - int flags, err, imap_valid = 0, uptodate = 1; + ssize_t len; + int err, imap_valid = 0, uptodate = 1; int count = 0; - int all_bh = 0; + int nonblocking = 0; trace_xfs_writepage(inode, page, 0); @@ -1056,10 +969,14 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - flags = BMAPI_READ; - type = IO_NEW; + type = IO_OVERWRITE; + + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + nonblocking = 1; do { + int new_ioend = 0; + if (offset >= end_offset) break; if (!buffer_uptodate(bh)) @@ -1076,90 +993,54 @@ xfs_vm_writepage( continue; } - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - - if (buffer_unwritten(bh) || buffer_delay(bh)) { - int new_ioend = 0; - - /* - * Make sure we don't use a read-only iomap - */ - if (flags == BMAPI_READ) - imap_valid = 0; - - if (buffer_unwritten(bh)) { + if (buffer_unwritten(bh)) { + if (type != IO_UNWRITTEN) { type = IO_UNWRITTEN; - flags = BMAPI_WRITE | BMAPI_IGNSTATE; - } else if (buffer_delay(bh)) { - type = IO_DELAY; - flags = BMAPI_ALLOCATE; - - if (wbc->sync_mode == WB_SYNC_NONE) - flags |= BMAPI_TRYLOCK; - } - - if (!imap_valid) { - /* - * If we didn't have a valid mapping then we - * need to ensure that we put the new mapping - * in a new ioend structure. This needs to be - * done to ensure that the ioends correctly - * reflect the block mappings at io completion - * for unwritten extent conversion. - */ - new_ioend = 1; - err = xfs_map_blocks(inode, offset, len, - &imap, flags); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, - offset); + imap_valid = 0; } - if (imap_valid) { - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, - &ioend, new_ioend); - count++; + } else if (buffer_delay(bh)) { + if (type != IO_DELALLOC) { + type = IO_DELALLOC; + imap_valid = 0; } } else if (buffer_uptodate(bh)) { - /* - * we got here because the buffer is already mapped. - * That means it must already have extents allocated - * underneath it. Map the extent by reading it. - */ - if (!imap_valid || flags != BMAPI_READ) { - flags = BMAPI_READ; - size = xfs_probe_cluster(inode, page, bh, head); - err = xfs_map_blocks(inode, offset, size, - &imap, flags); - if (err) - goto error; - imap_valid = xfs_imap_valid(inode, &imap, - offset); + if (type != IO_OVERWRITE) { + type = IO_OVERWRITE; + imap_valid = 0; } + } else { + if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); + imap_valid = 0; + } + continue; + } + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + if (!imap_valid) { /* - * We set the type to IO_NEW in case we are doing a - * small write at EOF that is extending the file but - * without needing an allocation. We need to update the - * file size on I/O completion in this case so it is - * the same case as having just allocated a new extent - * that we are writing into for the first time. + * If we didn't have a valid mapping then we need to + * put the new mapping into a separate ioend structure. + * This ensures non-contiguous extents always have + * separate ioends, which is particularly important + * for unwritten extent conversion at I/O completion + * time. */ - type = IO_NEW; - if (trylock_buffer(bh)) { - if (imap_valid) - all_bh = 1; - xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !imap_valid); - count++; - } else { - imap_valid = 0; - } - } else if (PageUptodate(page)) { - ASSERT(buffer_mapped(bh)); - imap_valid = 0; + new_ioend = 1; + err = xfs_map_blocks(inode, offset, &imap, type, + nonblocking); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, offset); + } + if (imap_valid) { + lock_buffer(bh); + if (type != IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, &ioend, + new_ioend); + count++; } if (!iohead) @@ -1188,7 +1069,7 @@ xfs_vm_writepage( end_index = last_index; xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, all_bh, end_index); + wbc, end_index); } if (iohead) @@ -1257,13 +1138,19 @@ __xfs_get_blocks( int create, int direct) { - int flags = create ? BMAPI_WRITE : BMAPI_READ; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; struct xfs_bmbt_irec imap; + int nimaps = 1; xfs_off_t offset; ssize_t size; - int nimap = 1; int new = 0; - int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); @@ -1272,15 +1159,45 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (direct && create) - flags |= BMAPI_DIRECT; + if (create) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); + } else { + lockmode = xfs_ilock_map_shared(ip); + } + + ASSERT(offset <= mp->m_maxioffset); + if (offset + size > mp->m_maxioffset) + size = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); + offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, - &new); + error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); if (error) - return -error; - if (nimap == 0) - return 0; + goto out_unlock; + + if (create && + (!nimaps || + (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK))) { + if (direct) { + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, size, &imap); + } + if (error) + goto out_unlock; + + trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + } else if (nimaps) { + trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + } else { + trace_xfs_get_blocks_notfound(ip, offset, size); + goto out_unlock; + } + xfs_iunlock(ip, lockmode); if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1347,6 +1264,10 @@ __xfs_get_blocks( } return 0; + +out_unlock: + xfs_iunlock(ip, lockmode); + return -error; } int @@ -1434,7 +1355,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - iocb->private = xfs_alloc_ioend(inode, IO_NEW); + iocb->private = xfs_alloc_ioend(inode, IO_DIRECT); ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index c5057fb6237..71f721e1a71 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_DIRECT = 0, /* special case for direct I/O ioends */ + IO_DELALLOC, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_OVERWRITE, /* mapping covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { 0, "" }, \ + { IO_DELALLOC, "delalloc" }, \ + { IO_UNWRITTEN, "unwritten" }, \ + { IO_OVERWRITE, "overwrite" } + +/* * xfs_ioend struct manages large extent writes for XFS. * It can manage several multi-page bio's at once. */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 4c5deb6e9e3..ac1c7e8378d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -44,12 +44,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); -static struct shrinker xfs_buf_shake = { - .shrink = xfsbufd_wakeup, - .seeks = DEFAULT_SEEKS, -}; static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; @@ -168,8 +163,79 @@ test_page_region( } /* - * Internal xfs_buf_t object manipulation + * xfs_buf_lru_add - add a buffer to the LRU. + * + * The LRU takes a new reference to the buffer so that it will only be freed + * once the shrinker takes the buffer off the LRU. */ +STATIC void +xfs_buf_lru_add( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (list_empty(&bp->b_lru)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_lru, &btp->bt_lru); + btp->bt_lru_nr++; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * xfs_buf_lru_del - remove a buffer from the LRU + * + * The unlocked check is safe here because it only occurs when there are not + * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there + * to optimise the shrinker removing the buffer from the LRU and calling + * xfs_buf_free(). i.e. it removes an unneccessary round trip on the + * bt_lru_lock. + */ +STATIC void +xfs_buf_lru_del( + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = bp->b_target; + + if (list_empty(&bp->b_lru)) + return; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. + * + * This prevents build-up of stale buffers on the LRU. + */ +void +xfs_buf_stale( + struct xfs_buf *bp) +{ + bp->b_flags |= XBF_STALE; + atomic_set(&(bp)->b_lru_ref, 0); + if (!list_empty(&bp->b_lru)) { + struct xfs_buftarg *btp = bp->b_target; + + spin_lock(&btp->bt_lru_lock); + if (!list_empty(&bp->b_lru)) { + list_del_init(&bp->b_lru); + btp->bt_lru_nr--; + atomic_dec(&bp->b_hold); + } + spin_unlock(&btp->bt_lru_lock); + } + ASSERT(atomic_read(&bp->b_hold) >= 1); +} STATIC void _xfs_buf_initialize( @@ -186,7 +252,9 @@ _xfs_buf_initialize( memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ @@ -262,6 +330,8 @@ xfs_buf_free( { trace_xfs_buf_free(bp, _RET_IP_); + ASSERT(list_empty(&bp->b_lru)); + if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; @@ -337,7 +407,6 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -827,7 +896,7 @@ xfs_buf_rele( trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { - ASSERT(!bp->b_relse); + ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) xfs_buf_free(bp); @@ -835,13 +904,15 @@ xfs_buf_rele( } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (bp->b_relse) { - atomic_inc(&bp->b_hold); + if (!(bp->b_flags & XBF_STALE) && + atomic_read(&bp->b_lru_ref)) { + xfs_buf_lru_add(bp); spin_unlock(&pag->pag_buf_lock); - bp->b_relse(bp); } else { + xfs_buf_lru_del(bp); ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -1438,51 +1509,84 @@ xfs_buf_iomove( */ /* - * Wait for any bufs with callbacks that have been submitted but - * have not yet returned... walk the hash list for the target. + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. */ void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_perag *pag; - uint i; + struct xfs_buf *bp; - for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { - pag = xfs_perag_get(btp->bt_mount, i); - spin_lock(&pag->pag_buf_lock); - while (rb_first(&pag->pag_buf_tree)) { - spin_unlock(&pag->pag_buf_lock); +restart: + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + if (atomic_read(&bp->b_hold) > 1) { + spin_unlock(&btp->bt_lru_lock); delay(100); - spin_lock(&pag->pag_buf_lock); + goto restart; } - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); + /* + * clear the LRU reference count so the bufer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + spin_unlock(&btp->bt_lru_lock); + xfs_buf_rele(bp); + spin_lock(&btp->bt_lru_lock); } + spin_unlock(&btp->bt_lru_lock); } -/* - * buftarg list for delwrite queue processing - */ -static LIST_HEAD(xfs_buftarg_list); -static DEFINE_SPINLOCK(xfs_buftarg_lock); - -STATIC void -xfs_register_buftarg( - xfs_buftarg_t *btp) +int +xfs_buftarg_shrink( + struct shrinker *shrink, + int nr_to_scan, + gfp_t mask) { - spin_lock(&xfs_buftarg_lock); - list_add(&btp->bt_list, &xfs_buftarg_list); - spin_unlock(&xfs_buftarg_lock); -} + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + struct xfs_buf *bp; + LIST_HEAD(dispose); -STATIC void -xfs_unregister_buftarg( - xfs_buftarg_t *btp) -{ - spin_lock(&xfs_buftarg_lock); - list_del(&btp->bt_list); - spin_unlock(&xfs_buftarg_lock); + if (!nr_to_scan) + return btp->bt_lru_nr; + + spin_lock(&btp->bt_lru_lock); + while (!list_empty(&btp->bt_lru)) { + if (nr_to_scan-- <= 0) + break; + + bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); + + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + list_move_tail(&bp->b_lru, &btp->bt_lru); + continue; + } + + /* + * remove the buffer from the LRU now to avoid needing another + * lock round trip inside xfs_buf_rele(). + */ + list_move(&bp->b_lru, &dispose); + btp->bt_lru_nr--; + } + spin_unlock(&btp->bt_lru_lock); + + while (!list_empty(&dispose)) { + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return btp->bt_lru_nr; } void @@ -1490,17 +1594,14 @@ xfs_free_buftarg( struct xfs_mount *mp, struct xfs_buftarg *btp) { + unregister_shrinker(&btp->bt_shrinker); + xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); iput(btp->bt_mapping->host); - /* Unregister the buftarg first so that we don't get a - * wakeup finding a non-existent task - */ - xfs_unregister_buftarg(btp); kthread_stop(btp->bt_task); - kmem_free(btp); } @@ -1597,20 +1698,13 @@ xfs_alloc_delwrite_queue( xfs_buftarg_t *btp, const char *fsname) { - int error = 0; - - INIT_LIST_HEAD(&btp->bt_list); INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) { - error = PTR_ERR(btp->bt_task); - goto out_error; - } - xfs_register_buftarg(btp); -out_error: - return error; + if (IS_ERR(btp->bt_task)) + return PTR_ERR(btp->bt_task); + return 0; } xfs_buftarg_t * @@ -1627,12 +1721,17 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; + INIT_LIST_HEAD(&btp->bt_lru); + spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; + btp->bt_shrinker.shrink = xfs_buftarg_shrink; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&btp->bt_shrinker); return btp; error: @@ -1737,27 +1836,6 @@ xfs_buf_runall_queues( flush_workqueue(queue); } -STATIC int -xfsbufd_wakeup( - struct shrinker *shrink, - int priority, - gfp_t mask) -{ - xfs_buftarg_t *btp; - - spin_lock(&xfs_buftarg_lock); - list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { - if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) - continue; - if (list_empty(&btp->bt_delwrite_queue)) - continue; - set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); - wake_up_process(btp->bt_task); - } - spin_unlock(&xfs_buftarg_lock); - return 0; -} - /* * Move as many buffers as specified to the supplied list * idicating if we skipped any buffers to prevent deadlocks. @@ -1952,7 +2030,6 @@ xfs_buf_init(void) if (!xfsconvertd_workqueue) goto out_destroy_xfsdatad_workqueue; - register_shrinker(&xfs_buf_shake); return 0; out_destroy_xfsdatad_workqueue: @@ -1968,7 +2045,6 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - unregister_shrinker(&xfs_buf_shake); destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 383a3f37cf9..cbe65950e52 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -128,10 +128,15 @@ typedef struct xfs_buftarg { /* per device delwri queue */ struct task_struct *bt_task; - struct list_head bt_list; struct list_head bt_delwrite_queue; spinlock_t bt_delwrite_lock; unsigned long bt_flags; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_head bt_lru; + spinlock_t bt_lru_lock; + unsigned int bt_lru_nr; } xfs_buftarg_t; /* @@ -147,8 +152,6 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); -typedef void (*xfs_buf_relse_t)(struct xfs_buf *); -typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 @@ -164,9 +167,11 @@ typedef struct xfs_buf { xfs_off_t b_file_offset; /* offset in file */ size_t b_buffer_length;/* size of buffer in bytes */ atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ + struct list_head b_lru; /* lru list */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ @@ -176,7 +181,6 @@ typedef struct xfs_buf { void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ - xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) -#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_STALE(bp) xfs_buf_stale(bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) #define XFS_BUF_SUPER_STALE(bp) do { \ @@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) #define XFS_BUF_SET_START(bp) do { } while (0) -#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) @@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) +static inline void +xfs_buf_set_ref( + struct xfs_buf *bp, + int lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) -#define XFS_BUF_SET_REF(bp, ref) do { } while (0) #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) @@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void); static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->b_relse) - xfs_buf_unlock(bp); + xfs_buf_unlock(bp); xfs_buf_rele(bp); } diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 00000000000..d61611c8801 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_discard.h" +#include "xfs_trace.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_fsblock_t start, + xfs_fsblock_t len, + xfs_fsblock_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_le(cur, 0, + XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + + /* + * Too small? Give up. + */ + if (flen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || + XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, + XFS_AGB_TO_DADDR(mp, agno, fbno), + XFS_FSB_TO_BB(mp, flen), + GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_fsblock_t start, len, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (!blk_queue_discard(q)) + return -XFS_ERROR(EOPNOTSUPP); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * XFS_B_TO_FSB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + start = XFS_B_TO_FSBT(mp, range.start); + len = XFS_B_TO_FSBT(mp, range.len); + minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + + start_agno = XFS_FSB_TO_AGNO(mp, start); + if (start_agno >= mp->m_sb.sb_agcount) + return -XFS_ERROR(EINVAL); + + end_agno = XFS_FSB_TO_AGNO(mp, start + len); + if (end_agno >= mp->m_sb.sb_agcount) + end_agno = mp->m_sb.sb_agcount - 1; + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, len, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 00000000000..e82b6dd3e12 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h @@ -0,0 +1,8 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 3764d74790e..fc0114da7fd 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -70,8 +70,16 @@ xfs_fs_encode_fh( else fileid_type = FILEID_INO32_GEN_PARENT; - /* filesystem may contain 64bit inode numbers */ - if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) + /* + * If the the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. + */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) fileid_type |= XFS_FILEID_TYPE_64FLAG; /* diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ba8ad422a16..a55c1b46b21 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,10 +37,45 @@ #include "xfs_trace.h" #include <linux/dcache.h> +#include <linux/falloc.h> static const struct vm_operations_struct xfs_file_vm_ops; /* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* * xfs_iozero * * xfs_iozero clears the specified range of buffer supplied, @@ -262,22 +297,21 @@ xfs_file_aio_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (iocb->ki_pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } } - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } else + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); @@ -285,7 +319,7 @@ xfs_file_aio_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -309,7 +343,7 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_ilock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); trace_xfs_file_splice_read(ip, count, *ppos, ioflags); @@ -317,10 +351,61 @@ xfs_file_splice_read( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } +STATIC void +xfs_aio_write_isize_update( + struct inode *inode, + loff_t *ppos, + ssize_t bytes_written) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fsize_t isize = i_size_read(inode); + + if (bytes_written > 0) + XFS_STATS_ADD(xs_write_bytes, bytes_written); + + if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && + *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * If this was a direct or synchronous I/O that failed (such as ENOSPC) then + * part of the I/O may have been written to disk before the error occured. In + * this case the on-disk file size may have been adjusted beyond the in-memory + * file size and now needs to be truncated back. + */ +STATIC void +xfs_aio_write_newsize_update( + struct xfs_inode *ip) +{ + if (ip->i_new_size) { + xfs_rw_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + } +} + +/* + * xfs_file_splice_write() does not use xfs_rw_ilock() because + * generic_file_splice_write() takes the i_mutex itself. This, in theory, + * couuld cause lock inversions between the aio_write path and the splice path + * if someone is doing concurrent splice(2) based writes and write(2) based + * writes to the same inode. The only real way to fix this is to re-implement + * the generic code here with correct locking orders. + */ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, @@ -331,7 +416,7 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize, new_size; + xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -355,27 +440,9 @@ xfs_file_splice_write( trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_aio_write_isize_update(inode, ppos, ret); + xfs_aio_write_newsize_update(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -562,247 +629,314 @@ out_lock: return error; } +/* + * Common pre-write limit and setup checks. + * + * Returns with iolock held according to @iolock. + */ STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0, error = 0; - int ioflags = 0; - xfs_fsize_t isize, new_size; - int iolock; - size_t ocount = 0, count; - int need_i_mutex; + xfs_fsize_t new_size; + int error = 0; - XFS_STATS_INC(xs_write_calls); + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = 0; + return error; + } - BUG_ON(iocb->ki_pos != pos); + new_size = *pos + *count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. + */ + if (*pos > ip->i_size) + error = -xfs_zero_eof(ip, *pos, ip->i_size); - error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; - count = ocount; - if (count == 0) - return 0; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; +} -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. xfs_ioend_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + size_t count = ocount; + int unaligned_io = 0; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return -XFS_ERROR(EINVAL); + + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + if (unaligned_io || mapping->nrpages || pos > ip->i_size) + *iolock = XFS_IOLOCK_EXCL; + else + *iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + + if (mapping->nrpages) { + WARN_ON(*iolock != XFS_IOLOCK_EXCL); + ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (ret) + return ret; } - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + xfs_ioend_wait(ip); + else if (*iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + *iolock = XFS_IOLOCK_SHARED; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} - if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + size_t count = ocount; - new_size = pos + count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); + ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + if (ret) + return ret; + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. + * if we just got an ENOSPC, flush the inode now we aren't holding any + * page locks and retry *once* */ - - if (pos > ip->i_size) { - error = xfs_zero_eof(ip, pos, ip->i_size); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } + if (ret == -ENOSPC && !enospc) { + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (ret) + return ret; + enospc = 1; + goto write_retry; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + current->backing_dev_info = NULL; + return ret; +} - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - error = -file_remove_suid(file); - if (unlikely(error)) - goto out_unlock_internal; +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int iolock; + size_t ocount = 0; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + XFS_STATS_INC(xs_write_calls); - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(ip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } + BUG_ON(iocb->ki_pos != pos); - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); + ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return ret; - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } + if (ocount == 0) + return 0; - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; - pos += ret; - count -= ret; + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); + else + ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, + ocount, &iolock); - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(ip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; + xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); -write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } + if (ret <= 0) + goto out_unlock; - current->backing_dev_info = NULL; + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error, error2; - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) - iocb->ki_pos = isize; + xfs_rw_iunlock(ip, iolock); + error = filemap_write_and_wait_range(mapping, pos, end); + xfs_rw_ilock(ip, iolock); - if (iocb->ki_pos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (iocb->ki_pos > ip->i_size) - ip->i_size = iocb->ki_pos; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error2 = -xfs_file_fsync(file, + (file->f_flags & __O_SYNC) ? 0 : 1); + if (error) + ret = error; + else if (error2) + ret = error2; } - error = -ret; - if (ret <= 0) - goto out_unlock_internal; +out_unlock: + xfs_aio_write_newsize_update(ip); + xfs_rw_iunlock(ip, iolock); + return ret; +} - XFS_STATS_ADD(xs_write_bytes, ret); +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error2; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); + xfs_ilock(ip, XFS_IOLOCK_EXCL); - error2 = -xfs_file_fsync(file, - (file->f_flags & __O_SYNC) ? 0 : 1); - if (!error) - error = error2; + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; } - out_unlock_internal: - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. - */ - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } - xfs_iunlock(ip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; } + STATIC int xfs_file_open( struct inode *inode, @@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index ad442d9e392..0ca0e3c024d 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -39,6 +39,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" #include "xfs_vnodeops.h" +#include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" @@ -694,14 +695,19 @@ xfs_ioc_fsgeometry_v1( xfs_mount_t *mp, void __user *arg) { - xfs_fsop_geom_v1_t fsgeo; + xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); + error = xfs_fs_geometry(mp, &fsgeo, 3); if (error) return -error; - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. + */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) return -XFS_ERROR(EFAULT); return 0; } @@ -984,10 +990,22 @@ xfs_ioctl_setattr( /* * Extent size must be a multiple of the appropriate block - * size, if set at all. + * size, if set at all. It must also be smaller than the + * maximum extent size supported by the filesystem. + * + * Also, for non-realtime files, limit the extent size hint to + * half the size of the AGs in the filesystem so alignment + * doesn't result in extents larger than an AG. */ if (fa->fsx_extsize != 0) { - xfs_extlen_t size; + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) { + code = XFS_ERROR(EINVAL); + goto error_return; + } if (XFS_IS_REALTIME_INODE(ip) || ((mask & FSX_XFLAGS) && @@ -996,6 +1014,10 @@ xfs_ioctl_setattr( mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { + code = XFS_ERROR(EINVAL); + goto error_return; + } } if (fa->fsx_extsize % size) { @@ -1294,6 +1316,8 @@ xfs_file_ioctl( trace_xfs_file_ioctl(ip); switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index d9298cf6026..9ff7fc603d2 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/falloc.h> #include <linux/fiemap.h> #include <linux/slab.h> @@ -506,58 +505,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, - 0, XFS_ATTR_NOLOCK); - if (error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -651,7 +598,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 214ddd71ff7..09649499774 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -37,7 +37,6 @@ #include <kmem.h> #include <mrlock.h> -#include <sv.h> #include <time.h> #include <support/debug.h> diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 064f964d4f3..9731898083a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -606,7 +606,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid device [%s], error=%d\n", name, error); @@ -620,7 +621,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } /* @@ -834,8 +835,11 @@ xfsaild_wakeup( struct xfs_ail *ailp, xfs_lsn_t threshold_lsn) { - ailp->xa_target = threshold_lsn; - wake_up_process(ailp->xa_task); + /* only ever move the target forwards */ + if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) { + ailp->xa_target = threshold_lsn; + wake_up_process(ailp->xa_task); + } } STATIC int @@ -847,8 +851,17 @@ xfsaild( long tout = 0; /* milliseconds */ while (!kthread_should_stop()) { - schedule_timeout_interruptible(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + /* + * for short sleeps indicating congestion, don't allow us to + * get woken early. Otherwise all we do is bang on the AIL lock + * without making progress. + */ + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? + msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); /* swsusp */ try_to_freeze(); @@ -935,7 +948,7 @@ out_reclaim: * Slab object creation initialisation for the XFS inode. * This covers only the idempotent fields in the XFS inode; * all other fields need to be initialised on allocation - * from the slab. This avoids the need to repeatedly intialise + * from the slab. This avoids the need to repeatedly initialise * fields in the xfs inode that left in the initialise state * when freeing the inode. */ @@ -1118,6 +1131,8 @@ xfs_fs_evict_inode( */ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); xfs_inactive(ip); } @@ -1399,7 +1414,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp, SYNC_WAIT); + return -xfs_fs_log_dummy(mp); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index afb0d7cfad1..e22f0057d21 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab( { struct inode *inode = VFS_I(ip); + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return EFSCORRUPTED; - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - return ENOENT; - /* If we can't grab the inode, it must on it's way to reclaim. */ if (!igrab(inode)) return ENOENT; @@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab( /* inode is valid */ return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; } STATIC int @@ -98,12 +118,12 @@ restart: int error = 0; int i; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); if (!nr_found) { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); break; } @@ -118,18 +138,26 @@ restart: batch[i] = NULL; /* - * Update the index for the next lookup. Catch overflows - * into the next AG range which can occur if we have inodes - * in the last block of the AG and we are currently - * pointing to the last inode. + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. */ - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); for (i = 0; i < nr_found; i++) { if (!batch[i]) @@ -334,7 +362,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -475,13 +503,14 @@ xfs_sync_worker( int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ - error = xfs_qm_sync(mp, SYNC_TRYLOCK); if (mp->m_super->s_frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp, 0); + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + xfs_reclaim_inodes(mp, 0); + error = xfs_qm_sync(mp, SYNC_TRYLOCK); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); @@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } @@ -639,9 +668,14 @@ xfs_reclaim_inode_grab( struct xfs_inode *ip, int flags) { + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; /* - * do some unlocked checks first to avoid unnecceary lock traffic. + * do some unlocked checks first to avoid unnecessary lock traffic. * The first is a flush lock check, the second is a already in reclaim * check. Only do these checks if we are not going to block on locks. */ @@ -654,11 +688,16 @@ xfs_reclaim_inode_grab( * The radix tree lock here protects a thread in xfs_iget from racing * with us starting reclaim on the inode. Once we have the * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. */ spin_lock(&ip->i_flags_lock); - ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* ignore as it is already under reclaim */ + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ spin_unlock(&ip->i_flags_lock); return 1; } @@ -795,12 +834,12 @@ reclaim: * added to the tree assert that it's been there before to catch * problems with the inode life time early on. */ - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); /* * Here we do an (almost) spurious inode lock in order to coordinate @@ -864,14 +903,14 @@ restart: struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; - write_lock(&pag->pag_ici_lock); + rcu_read_lock(); nr_found = radix_tree_gang_lookup_tag( &pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH, XFS_ICI_RECLAIM_TAG); if (!nr_found) { - write_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); break; } @@ -891,14 +930,24 @@ restart: * occur if we have inodes in the last block of * the AG and we are currently pointing to the * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; } /* unlock now we've grabbed the inodes. */ - write_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); for (i = 0; i < nr_found; i++) { if (!batch[i]) diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7bb5092d6ae..ee3cee097e7 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -18,6 +18,7 @@ #include "xfs.h" #include <linux/sysctl.h> #include <linux/proc_fs.h> +#include "xfs_error.h" static struct ctl_table_header *xfs_table_header; @@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler( return ret; } + +STATIC int +xfs_panic_mask_proc_handler( + ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} #endif /* CONFIG_PROC_FS */ static ctl_table xfs_table[] = { @@ -77,7 +98,7 @@ static ctl_table xfs_table[] = { .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_panic_mask_proc_handler, .extra1 = &xfs_params.panic_mask.min, .extra2 = &xfs_params.panic_mask.max }, diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index acef2e98c59..2d0bcb47907 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __field(int, curr_res) __field(int, unit_res) __field(unsigned int, flags) - __field(void *, reserve_headq) - __field(void *, write_headq) + __field(int, reserveq) + __field(int, writeq) __field(int, grant_reserve_cycle) __field(int, grant_reserve_bytes) __field(int, grant_write_cycle) @@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res = tic->t_curr_res; __entry->unit_res = tic->t_unit_res; __entry->flags = tic->t_flags; - __entry->reserve_headq = log->l_reserve_headq; - __entry->write_headq = log->l_write_headq; - __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; - __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; - __entry->grant_write_cycle = log->l_grant_write_cycle; - __entry->grant_write_bytes = log->l_grant_write_bytes; + __entry->reserveq = list_empty(&log->l_reserveq); + __entry->writeq = list_empty(&log->l_writeq); + xlog_crack_grant_head(&log->l_grant_reserve_head, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_grant_write_head, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); __entry->curr_cycle = log->l_curr_cycle; __entry->curr_block = log->l_curr_block; - __entry->tail_lsn = log->l_tail_lsn; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " - "t_unit_res %u t_flags %s reserve_headq 0x%p " - "write_headq 0x%p grant_reserve_cycle %d " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", @@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_res, __entry->unit_res, __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), - __entry->reserve_headq, - __entry->write_headq, + __entry->reserveq ? "empty" : "active", + __entry->writeq ? "empty" : "active", __entry->grant_reserve_cycle, __entry->grant_reserve_bytes, __entry->grant_write_cycle, @@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); @@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); @@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); -DECLARE_EVENT_CLASS(xfs_iomap_class, +DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int flags, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, flags, irec), + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) + __field(int, type) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; + __entry->type = type; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd flags %s " + "offset 0x%llx count %zd type %s " "startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, __entry->new_size, __entry->offset, __entry->count, - __print_flags(__entry->flags, "|", BMAPI_FLAGS), + __print_symbolic(__entry->type, XFS_IO_TYPES), __entry->startoff, (__int64_t)__entry->startblock, __entry->blockcount) ) #define DEFINE_IOMAP_EVENT(name) \ -DEFINE_EVENT(xfs_iomap_class, name, \ +DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int flags, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, flags, irec)) -DEFINE_IOMAP_EVENT(xfs_iomap_enter); -DEFINE_IOMAP_EVENT(xfs_iomap_found); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \ TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); TRACE_EVENT(xfs_itruncate_start, @@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \ TP_PROTO(struct xfs_alloc_arg *args), \ TP_ARGS(args)) DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_near_first); @@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index faf8e1a83a1..d22aa310310 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -149,7 +149,6 @@ xfs_qm_dqdestroy( ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); - sv_destroy(&dqp->q_pinwait); kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); atomic_dec(&xfs_Gqm->qm_totaldquots); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index f8e854b4fde..206a2815ced 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void) xfs_dquot_t *dqpout; xfs_dquot_t *dqp; int restarts; + int startagain; restarts = 0; dqpout = NULL; /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ -startagain: +again: + startagain = 0; mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { @@ -1885,13 +1887,10 @@ startagain: ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); trace_xfs_dqreclaim_want(dqp); - - xfs_dqunlock(dqp); - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; + restarts++; + startagain = 1; + goto dqunlock; } /* @@ -1906,23 +1905,20 @@ startagain: ASSERT(list_empty(&dqp->q_mplist)); list_del_init(&dqp->q_freelist); xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); dqpout = dqp; XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; + goto dqunlock; } ASSERT(dqp->q_hash); ASSERT(!list_empty(&dqp->q_mplist)); /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. + * Try to grab the flush lock. If this dquot is in the process + * of getting flushed to disk, we don't want to reclaim it. */ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } + if (!xfs_dqflock_nowait(dqp)) + goto dqunlock; /* * We have the flush lock so we know that this is not in the @@ -1944,8 +1940,7 @@ startagain: xfs_fs_cmn_err(CE_WARN, mp, "xfs_qm_dqreclaim: dquot %p flush failed", dqp); } - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; + goto dqunlock; } /* @@ -1967,13 +1962,8 @@ startagain: */ if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { restarts++; - mutex_unlock(&dqp->q_hash->qh_lock); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - goto startagain; + startagain = 1; + goto qhunlock; } ASSERT(dqp->q_nrefs == 0); @@ -1986,14 +1976,20 @@ startagain: xfs_Gqm->qm_dqfrlist_cnt--; dqpout = dqp; mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); +qhunlock: mutex_unlock(&dqp->q_hash->qh_lock); dqfunlock: xfs_dqfunlock(dqp); +dqunlock: xfs_dqunlock(dqp); if (dqpout) break; if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; + break; + if (startagain) { + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + goto again; + } } mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); return dqpout; diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 975aa10e1a4..0df88897ef8 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -25,86 +25,78 @@ #include "xfs_mount.h" #include "xfs_error.h" -static char message[1024]; /* keep it off the stack */ -static DEFINE_SPINLOCK(xfs_err_lock); - -/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ -#define XFS_MAX_ERR_LEVEL 7 -#define XFS_ERR_MASK ((1 << 3) - 1) -static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = - {KERN_EMERG, KERN_ALERT, KERN_CRIT, - KERN_ERR, KERN_WARNING, KERN_NOTICE, - KERN_INFO, KERN_DEBUG}; - void -cmn_err(register int level, char *fmt, ...) +cmn_err( + const char *lvl, + const char *fmt, + ...) { - char *fp = fmt; - int len; - ulong flags; - va_list ap; - - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - va_start(ap, fmt); - if (*fmt == '!') fp++; - len = vsnprintf(message, sizeof(message), fp, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; - printk("%s%s\n", err_level[level], message); - va_end(ap); - spin_unlock_irqrestore(&xfs_err_lock,flags); - BUG_ON(level == CE_PANIC); + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%pV", lvl, &vaf); + va_end(args); + + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); } void -xfs_fs_vcmn_err( - int level, +xfs_fs_cmn_err( + const char *lvl, struct xfs_mount *mp, - char *fmt, - va_list ap) + const char *fmt, + ...) { - unsigned long flags; - int len = 0; + struct va_format vaf; + va_list args; - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - spin_lock_irqsave(&xfs_err_lock,flags); + printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf); + va_end(args); - if (mp) { - len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); + BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0); +} + +/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */ +void +xfs_cmn_err( + int panic_tag, + const char *lvl, + struct xfs_mount *mp, + const char *fmt, + ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; - /* - * Skip the printk if we can't print anything useful - * due to an over-long device name. - */ - if (len >= sizeof(message)) - goto out; + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + printk(KERN_ALERT "XFS: Transforming an alert into a BUG."); + do_panic = 1; } - len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); - if (len >= sizeof(message)) - len = sizeof(message) - 1; - if (message[len-1] == '\n') - message[len-1] = 0; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; - printk("%s%s\n", err_level[level], message); - out: - spin_unlock_irqrestore(&xfs_err_lock,flags); + printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf); + va_end(args); - BUG_ON(level == CE_PANIC); + BUG_ON(do_panic); } void assfail(char *expr, char *file, int line) { - printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); + printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr, + file, line); BUG(); } diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index d2d20462fd4..05699f67d47 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -20,15 +20,22 @@ #include <stdarg.h> -#define CE_DEBUG 7 /* debug */ -#define CE_CONT 6 /* continuation */ -#define CE_NOTE 5 /* notice */ -#define CE_WARN 4 /* warning */ -#define CE_ALERT 1 /* alert */ -#define CE_PANIC 0 /* panic */ - -extern void cmn_err(int, char *, ...) - __attribute__ ((format (printf, 2, 3))); +struct xfs_mount; + +#define CE_DEBUG KERN_DEBUG +#define CE_CONT KERN_INFO +#define CE_NOTE KERN_NOTICE +#define CE_WARN KERN_WARNING +#define CE_ALERT KERN_ALERT +#define CE_PANIC KERN_EMERG + +void cmn_err(const char *lvl, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); +void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp, + const char *fmt, ...) __attribute__ ((format (printf, 4, 5))); + extern void assfail(char *expr, char *f, int l); #define ASSERT_ALWAYS(expr) \ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 63c7a1a6c02..58632cc17f2 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -227,7 +227,7 @@ typedef struct xfs_perag { atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - rwlock_t pag_ici_lock; /* incore inode lock */ + spinlock_t pag_ici_lock; /* incore inode cache lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ struct mutex pag_ici_reclaim_lock; /* serialisation point */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 112abc439ca..f3227984a9b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,10 +41,6 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -static int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - /* * Prototypes for per-ag allocation routines */ @@ -94,7 +90,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -127,7 +123,7 @@ xfs_alloc_update( /* * Get the data from the pointed-to record. */ -STATIC int /* error */ +int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t *bno, /* output: starting block of extent */ @@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact( xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); + /* * Allocate/initialize a cursor for the by-number freespace btree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->agno, XFS_BTNUM_BNO); + /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). * Look for the closest free block <= bno, it must contain bno * if any free block does. */ - if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) + error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); + if (error) goto error0; - if (!i) { - /* - * Didn't find it, return null. - */ - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + if (!i) + goto not_found; + /* * Grab the freespace record. */ - if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) + error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); + if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); ASSERT(fbno <= args->agbno); minend = args->agbno + args->minlen; maxend = args->agbno + args->maxlen; fend = fbno + flen; + /* * Give up if the freespace isn't long enough for the minimum request. */ - if (fend < minend) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + if (fend < minend) + goto not_found; + /* * End of extent will be smaller of the freespace end and the * maximal requested end. - */ - end = XFS_AGBLOCK_MIN(fend, maxend); - /* + * * Fix the length according to mod and prod if given. */ + end = XFS_AGBLOCK_MIN(fend, maxend); args->len = end - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - return 0; - } + if (!xfs_alloc_fix_minleft(args)) + goto not_found; + rlen = args->len; ASSERT(args->agbno + rlen <= fend); end = args->agbno + rlen; + /* * We are allocating agbno for rlen [agbno .. end] * Allocate/initialize a cursor for the by-size btree. @@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact( args->agno, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, - args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { + error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, + args->len, XFSA_FIXUP_BNO_OK); + if (error) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); goto error0; } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_exact_done(args); args->wasfromfl = 0; + trace_xfs_alloc_exact_done(args); + return 0; + +not_found: + /* Didn't find it, return null. */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_exact_notfound(args); return 0; error0: @@ -659,6 +661,95 @@ error0: } /* + * Search the btree in a given direction via the search cursor and compare + * the records found against the good extent we've already found. + */ +STATIC int +xfs_alloc_find_best_extent( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur **gcur, /* good cursor */ + struct xfs_btree_cur **scur, /* searching cursor */ + xfs_agblock_t gdiff, /* difference for search comparison */ + xfs_agblock_t *sbno, /* extent found by search */ + xfs_extlen_t *slen, + xfs_extlen_t *slena, /* aligned length */ + int dir) /* 0 = search right, 1 = search left */ +{ + xfs_agblock_t bno; + xfs_agblock_t new; + xfs_agblock_t sdiff; + int error; + int i; + + /* The good extent is perfect, no need to search. */ + if (!gdiff) + goto out_use_good; + + /* + * Look until we find a better one, run out of space or run off the end. + */ + do { + error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, + args->minlen, &bno, slena); + + /* + * The good extent is closer than this one. + */ + if (!dir) { + if (bno >= args->agbno + gdiff) + goto out_use_good; + } else { + if (bno <= args->agbno - gdiff) + goto out_use_good; + } + + /* + * Same distance, compare length and pick the best. + */ + if (*slena >= args->minlen) { + args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); + xfs_alloc_fix_len(args); + + sdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, *sbno, + *slen, &new); + + /* + * Choose closer size and invalidate other cursor. + */ + if (sdiff < gdiff) + goto out_use_search; + goto out_use_good; + } + + if (!dir) + error = xfs_btree_increment(*scur, 0, &i); + else + error = xfs_btree_decrement(*scur, 0, &i); + if (error) + goto error0; + } while (i); + +out_use_good: + xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); + *scur = NULL; + return 0; + +out_use_search: + xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); + *gcur = NULL; + return 0; + +error0: + /* caller invalidates cursors */ + return error; +} + +/* * Allocate a variable extent near bno in the allocation group agno. * Extent's length (returned in len) will be between minlen and maxlen, * and of the form k * prod + mod unless there's nothing that large. @@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near( } } } while (bno_cur_lt || bno_cur_gt); + /* * Got both cursors still active, need to find better entry. */ if (bno_cur_lt && bno_cur_gt) { - /* - * Left side is long enough, look for a right side entry. - */ if (ltlena >= args->minlen) { /* - * Fix up the length. + * Left side is good, look for a right side entry. */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, ltbno, ltlen, <new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, >bno, >len, >lena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + /* - * Not perfect. - */ - if (ltdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_gt, >bno, - >len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(gtbno, gtlen, - args->alignment, args->minlen, - >bnoa, >lena); - /* - * The left one is clearly better. - */ - if (gtbnoa >= args->agbno + ltdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (gtlena >= args->minlen) { - args->len = - XFS_EXTLEN_MIN(gtlena, - args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - gtbno, gtlen, >new); - /* - * Right side is better. - */ - if (gtdiff < ltdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - /* - * Left side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - break; - } - /* - * Fell off the right end. - */ - if ((error = xfs_btree_increment( - bno_cur_gt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - } - } - /* - * The left side is perfect, trash the right side. - */ - else { - xfs_btree_del_cursor(bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - } - /* - * It's the right side that was found first, look left. - */ - else { - /* - * Fix up the length. + * Right side is good, look for a left side entry. */ args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, gtbno, gtlen, >new); - /* - * Right side entry isn't perfect. - */ - if (gtdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_lt, <bno, - <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena); - /* - * The right one is clearly better. - */ - if (ltbnoa <= args->agbno - gtdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (ltlena >= args->minlen) { - args->len = XFS_EXTLEN_MIN( - ltlena, args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - ltbno, ltlen, <new); - /* - * Left side is better. - */ - if (ltdiff < gtdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - /* - * Right side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - break; - } - /* - * Fell off the left end. - */ - if ((error = xfs_btree_decrement( - bno_cur_lt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - } - } - /* - * The right side is perfect, trash the left side. - */ - else { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, <bno, <len, <lena, + 1 /* search left */); } + + if (error) + goto error0; } + /* * If we couldn't get anything, give up. */ @@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near( args->agbno = NULLAGBLOCK; return 0; } + /* * At this point we have selected a freespace entry, either to the * left or to the right. If it's on the right, copy all the @@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near( j = 1; } else j = 0; + /* * Fix up the length and compute the useful address. */ @@ -2676,7 +2611,7 @@ restart: * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. */ -static int +int xfs_alloc_busy_search( struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 895009a9727..d0b3bc72005 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -19,6 +19,7 @@ #define __XFS_ALLOC_H__ struct xfs_buf; +struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; @@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) /* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + +/* * Argument structure for xfs_alloc routines. * This is turned into a structure to avoid having 20 arguments passed * down several levels of the stack. @@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); #ifdef __KERNEL__ - void -xfs_alloc_busy_insert(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); void xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); #endif /* __KERNEL__ */ /* @@ -205,4 +222,18 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index a6cff8edcdb..71e90dc2aeb 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) * It didn't all fit, so we have to sort everything on hashval. */ sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); /* * Scan the attribute list for the rest of the entries, storing @@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP); + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 4111cd3966c..dc3afd7739f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real( * Filling in the middle part of a previous delayed allocation. * Contiguity is impossible here. * This case is avoided almost all the time. + * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 */ temp = new->br_startoff - PREV.br_startoff; - trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - r[0] = *new; - r[1].br_state = PREV.br_state; - r[1].br_startblock = 0; - r[1].br_startoff = new_endoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - r[1].br_blockcount = temp2; - xfs_iext_insert(ip, idx + 1, 2, &r[0], state); + trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) @@ -2430,7 +2447,7 @@ xfs_bmap_btalloc_nullfb( startag = ag = 0; pag = xfs_perag_get(mp, ag); - while (*blen < ap->alen) { + while (*blen < args->maxlen) { if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, args->tp, ag, XFS_ALLOC_FLAG_TRYLOCK); @@ -2452,7 +2469,7 @@ xfs_bmap_btalloc_nullfb( notinit = 1; if (xfs_inode_is_filestream(ap->ip)) { - if (*blen >= ap->alen) + if (*blen >= args->maxlen) break; if (ap->userdata) { @@ -2498,14 +2515,14 @@ xfs_bmap_btalloc_nullfb( * If the best seen length is less than the request * length, use the best as the minimum. */ - else if (*blen < ap->alen) + else if (*blen < args->maxlen) args->minlen = *blen; /* - * Otherwise we've seen an extent as big as alen, + * Otherwise we've seen an extent as big as maxlen, * use that as the minimum. */ else - args->minlen = ap->alen; + args->minlen = args->maxlen; /* * set the failure fallback case to look in the selected @@ -2573,7 +2590,9 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->rval; - args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); + + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); args.firstblock = ap->firstblock; blen = 0; if (nullfb) { @@ -2621,7 +2640,7 @@ xfs_bmap_btalloc( /* * Adjust for alignment */ - if (blen > args.alignment && blen <= ap->alen) + if (blen > args.alignment && blen <= args.maxlen) args.minlen = blen - args.alignment; args.minalignslop = 0; } else { @@ -2640,7 +2659,7 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= ap->alen) + if (blen > mp->m_dalign && blen <= args.maxlen) nextminlen = blen - mp->m_dalign; else nextminlen = args.minlen; @@ -4485,6 +4504,16 @@ xfs_bmapi( /* Figure out the extent size, adjust alen */ extsz = xfs_get_extsz_hint(ip); if (extsz) { + /* + * make sure we don't exceed a single + * extent length when we align the + * extent by reducing length we are + * going to allocate by the maximum + * amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, + MAXEXTLEN - (2 * extsz - 1)); error = xfs_bmap_extsize_align(mp, &got, &prev, extsz, rt, eof, diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 04f9cca8da7..2f9e97c128a 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -634,9 +634,8 @@ xfs_btree_read_bufl( return error; } ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { + if (bp) XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - } *bpp = bp; return 0; } @@ -944,13 +943,13 @@ xfs_btree_set_refs( switch (cur->bc_btnum) { case XFS_BTNUM_BNO: case XFS_BTNUM_CNT: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); break; default: ASSERT(0); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2686d0d54c5..6f8c21ce0d6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -141,8 +141,7 @@ xfs_buf_item_log_check( #define xfs_buf_item_log_check(x) #endif -STATIC void xfs_buf_error_relse(xfs_buf_t *bp); -STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* * This returns the number of log iovecs needed to log the @@ -428,13 +427,15 @@ xfs_buf_item_unpin( if (remove) { /* - * We have to remove the log item from the transaction - * as we are about to release our reference to the - * buffer. If we don't, the unlock that occurs later - * in xfs_trans_uncommit() will ry to reference the + * If we are in a transaction context, we have to + * remove the log item from the transaction as we are + * about to release our reference to the buffer. If we + * don't, the unlock that occurs later in + * xfs_trans_uncommit() will try to reference the * buffer which we no longer have a hold on. */ - xfs_trans_del_item(lip); + if (lip->li_desc) + xfs_trans_del_item(lip); /* * Since the transaction no longer refers to the buffer, @@ -450,7 +451,7 @@ xfs_buf_item_unpin( * xfs_trans_ail_delete() drops the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); + xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); } else { @@ -918,15 +919,26 @@ xfs_buf_attach_iodone( XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); } +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow for a single + * callback to be able to scan the remaining lip->li_bio_list for other items + * of the same type and callback to be processed in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. it removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. This allows the + * callback to scan and modify the list attached to the buffer and we don't + * have to care about maintaining a next item pointer. + */ STATIC void xfs_buf_do_callbacks( - xfs_buf_t *bp, - xfs_log_item_t *lip) + struct xfs_buf *bp) { - xfs_log_item_t *nlip; + struct xfs_log_item *lip; - while (lip != NULL) { - nlip = lip->li_bio_list; + while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { + XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); ASSERT(lip->li_cb != NULL); /* * Clear the next pointer so we don't have any @@ -936,7 +948,6 @@ xfs_buf_do_callbacks( */ lip->li_bio_list = NULL; lip->li_cb(bp, lip); - lip = nlip; } } @@ -949,128 +960,76 @@ xfs_buf_do_callbacks( */ void xfs_buf_iodone_callbacks( - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_log_item_t *lip; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - xfs_mount_t *mp; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (likely(!XFS_BUF_GETERROR(bp))) + goto do_callbacks; - if (XFS_BUF_GETERROR(bp) != 0) { - /* - * If we've already decided to shutdown the filesystem - * because of IO errors, there's no point in giving this - * a retry. - */ - mp = lip->li_mountp; - if (XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - XFS_BUF_SUPER_STALE(bp); - trace_xfs_buf_item_iodone(bp, _RET_IP_); - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); - return; - } + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + XFS_BUF_SUPER_STALE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } - if ((XFS_BUF_TARGET(bp) != lasttarg) || - (time_after(jiffies, (lasttime + 5*HZ)))) { - lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); - } - lasttarg = XFS_BUF_TARGET(bp); + if (XFS_BUF_TARGET(bp) != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + cmn_err(CE_ALERT, "Device %s, XFS metadata write error" + " block 0x%llx in %s", + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + } + lasttarg = XFS_BUF_TARGET(bp); - if (XFS_BUF_ISASYNC(bp)) { - /* - * If the write was asynchronous then noone will be - * looking for the error. Clear the error state - * and write the buffer out again delayed write. - * - * XXXsup This is OK, so long as we catch these - * before we start the umount; we don't want these - * DELWRI metadata bufs to be hanging around. - */ - XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ - - if (!(XFS_BUF_ISSTALE(bp))) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); - } else { - /* - * If the write of the buffer was not asynchronous, - * then we want to make sure to return the error - * to the caller of bwrite(). Because of this we - * cannot clear the B_ERROR state at this point. - * Instead we install a callback function that - * will be called when the buffer is released, and - * that routine will clear the error state and - * set the buffer to be written out again after - * some delay. - */ - /* We actually overwrite the existing b-relse - function at times, but we're gonna be shutting down - anyway. */ - XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); + /* + * If the write was asynchronous then noone will be looking for the + * error. Clear the error state and write the buffer out again. + * + * During sync or umount we'll write all pending buffers again + * synchronous, which will catch these errors if they keep hanging + * around. + */ + if (XFS_BUF_ISASYNC(bp)) { + XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + + if (!XFS_BUF_ISSTALE(bp)) { + XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_FINISH_IOWAIT(bp); + XFS_BUF_SET_START(bp); } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_relse(bp); return; } - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_buf_ioend(bp, 0); -} - -/* - * This is a callback routine attached to a buffer which gets an error - * when being written out synchronously. - */ -STATIC void -xfs_buf_error_relse( - xfs_buf_t *bp) -{ - xfs_log_item_t *lip; - xfs_mount_t *mp; - - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - mp = (xfs_mount_t *)lip->li_mountp; - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - + /* + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). + */ XFS_BUF_STALE(bp); XFS_BUF_DONE(bp); XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_ERROR(bp,0); trace_xfs_buf_error_relse(bp, _RET_IP_); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - if (! XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - /* - * We have to unpin the pinned buffers so do the - * callbacks. - */ - xfs_buf_do_callbacks(bp, lip); +do_callbacks: + xfs_buf_do_callbacks(bp); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_SET_BRELSE_FUNC(bp,NULL); - xfs_buf_relse(bp); + xfs_buf_ioend(bp, 0); } - /* * This is the iodone() function for buffers which have been * logged. It is called when they are eventually flushed out. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 0e2ed43f16c..b6ecd2061e7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item { xfs_buf_log_format_t bli_format; /* in-log header */ } xfs_buf_log_item_t; -/* - * This structure is used during recovery to record the buf log - * items which have been canceled and should not be replayed. - */ -typedef struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct xfs_buf_cancel *bc_next; -} xfs_buf_cancel_t; - void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c78cc6a3d87..4c7db74a05f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) } #endif /* DEBUG */ - -void -xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - -void -xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) -{ - va_list ap; - -#ifdef DEBUG - xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); -#endif - - if (xfs_panic_mask && (xfs_panic_mask & panic_tag) - && (level & CE_ALERT)) { - level &= ~CE_ALERT; - level |= CE_PANIC; - cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); - } - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); -} - void xfs_error_report( const char *tag, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index f338847f80b..10dce5475f0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) @@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); struct xfs_mount; -extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, - char *fmt, va_list ap) - __attribute__ ((format (printf, 3, 0))); -extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...) - __attribute__ ((format (printf, 4, 5))); -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); - extern void xfs_hex_dump(void *p, int length); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) #define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) + do { \ + if (!(f & XFS_MFSI_QUIET)) \ + cmn_err(CE_WARN, "XFS: " fmt, ## args); \ + } while (0) #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a55e687bf56..d22e6262343 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -48,6 +48,28 @@ xfs_efi_item_free( } /* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the + * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees + * the EFI. + */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_efi_item_free(efip); + } +} + +/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. @@ -74,7 +96,8 @@ xfs_efi_item_format( struct xfs_efi_log_item *efip = EFI_ITEM(lip); uint size; - ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); efip->efi_format.efi_type = XFS_LI_EFI; @@ -99,10 +122,12 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the - * last place at which the EFI is manipulated during a transaction. - * Here we coordinate with xfs_efi_cancel() to determine who gets to - * free the EFI. + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() (via + * XFS_EFI_COMMITTED) to determine who gets to free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -110,20 +135,15 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_ail *ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (efip->efi_flags & XFS_EFI_CANCELED) { - if (remove) + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + if (lip->li_desc) xfs_trans_del_item(lip); - - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, lip); xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_COMMITTED; - spin_unlock(&ailp->xa_lock); + return; } + __xfs_efi_release(efip); } /* @@ -152,16 +172,20 @@ xfs_efi_item_unlock( } /* - * The EFI is logged only once and cannot be moved in the log, so - * simply return the lsn at which it's been logged. The canceled - * flag is not paid any attention here. Checking for that is delayed - * until the EFI is unpinned. + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. For bulk transaction committed + * processing, the EFI may be processed but not yet unpinned prior to the EFD + * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected + * when processing the EFD. */ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -230,6 +254,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); return efip; } @@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to - * the given efi item. Each efd calls this with the number of - * extents that it has logged, and when the sum of these reaches - * the total number of extents logged by this efi item we can free - * the efi item. - * - * Freeing the efi item requires that we remove it from the AIL. - * We'll use the AIL lock to protect our counters as well as - * the removal from the AIL. + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. */ void xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { - struct xfs_ail *ailp = efip->efi_item.li_ailp; - int extents_left; - - ASSERT(efip->efi_next_extent > 0); - ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); - - spin_lock(&ailp->xa_lock); - ASSERT(efip->efi_next_extent >= nextents); - efip->efi_next_extent -= nextents; - extents_left = efip->efi_next_extent; - if (extents_left == 0) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); - xfs_efi_item_free(efip); - } else { - spin_unlock(&ailp->xa_lock); - } + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + __xfs_efi_release(efip); } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0d22c56fdf6..375f68e4253 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 { #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flags. + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. */ -#define XFS_EFI_RECOVERED 0x1 -#define XFS_EFI_COMMITTED 0x2 -#define XFS_EFI_CANCELED 0x4 +#define XFS_EFI_RECOVERED 1 +#define XFS_EFI_COMMITTED 2 /* * This is the "extent free intention" log item. It is used @@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 { */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; - uint efi_flags; /* misc flags */ - uint efi_next_extent; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; } xfs_efi_log_item_t; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a7c116e814a..85668efb3e3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -53,6 +53,9 @@ xfs_fs_geometry( xfs_fsop_geom_t *geo, int new_version) { + + memset(geo, 0, sizeof(*geo)); + geo->blocksize = mp->m_sb.sb_blocksize; geo->rtextsize = mp->m_sb.sb_rextsize; geo->agblocks = mp->m_sb.sb_agblocks; @@ -374,6 +377,7 @@ xfs_growfs_data_private( mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else mp->m_maxicount = 0; + xfs_set_low_space_thresholds(mp); /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { @@ -611,12 +615,13 @@ out: * * We cannot use an inode here for this - that will push dirty state back up * into the VFS and then periodic inode flushing will prevent log covering from - * making progress. Hence we log a field in the superblock instead. + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. */ int xfs_fs_log_dummy( - xfs_mount_t *mp, - int flags) + xfs_mount_t *mp) { xfs_trans_t *tp; int error; @@ -631,8 +636,7 @@ xfs_fs_log_dummy( /* log the UUID because it is an unchanging field */ xfs_mod_sb(tp, XFS_SB_UUID); - if (flags & SYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index a786c5212c1..1b6a98b6688 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index d7de5a3f786..cb9b6d1469f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -43,6 +43,17 @@ /* + * Define xfs inode iolock lockdep classes. We need to ensure that all active + * inodes are considered the same for lockdep purposes, including inodes that + * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to + * guarantee the locks are considered the same when there are multiple lock + * initialisation siteѕ. Also, define a reclaimable inode class so it is + * obvious in lockdep reports which class the report is against. + */ +static struct lock_class_key xfs_iolock_active; +struct lock_class_key xfs_iolock_reclaimable; + +/* * Allocate and initialise an xfs_inode. */ STATIC struct xfs_inode * @@ -69,8 +80,11 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); + ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); /* initialise the xfs inode */ ip->i_ino = ino; @@ -85,9 +99,6 @@ xfs_inode_alloc( ip->i_size = 0; ip->i_new_size = 0; - /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW; - return ip; } @@ -145,7 +156,18 @@ xfs_inode_free( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } /* @@ -155,14 +177,29 @@ static int xfs_iget_cache_hit( struct xfs_perag *pag, struct xfs_inode *ip, + xfs_ino_t ino, int flags, - int lock_flags) __releases(pag->pag_ici_lock) + int lock_flags) __releases(RCU) { struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; int error; + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + /* * If we are racing with another cache hit that is currently @@ -205,7 +242,7 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); error = -inode_init_always(mp->m_super, inode); if (error) { @@ -213,7 +250,7 @@ xfs_iget_cache_hit( * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_INEW; @@ -223,14 +260,20 @@ xfs_iget_cache_hit( goto out_error; } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ip->i_flags |= XFS_INEW; __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + lockdep_set_class_and_name(&ip->i_iolock.mr_lock, + &xfs_iolock_active, "xfs_iolock_active"); + spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -241,7 +284,7 @@ xfs_iget_cache_hit( /* We've got a live one. */ spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); trace_xfs_iget_hit(ip); } @@ -255,7 +298,7 @@ xfs_iget_cache_hit( out_error: spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); return error; } @@ -308,7 +351,7 @@ xfs_iget_cache_miss( BUG(); } - write_lock(&pag->pag_ici_lock); + spin_lock(&pag->pag_ici_lock); /* insert the new inode */ error = radix_tree_insert(&pag->pag_ici_root, agino, ip); @@ -323,14 +366,14 @@ xfs_iget_cache_miss( ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); *ipp = ip; return 0; out_preload_end: - write_unlock(&pag->pag_ici_lock); + spin_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); if (lock_flags) xfs_iunlock(ip, lock_flags); @@ -377,7 +420,7 @@ xfs_iget( xfs_agino_t agino; /* reject inode numbers outside existing AGs */ - if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; /* get the perag structure and ensure that it's inode capable */ @@ -386,15 +429,15 @@ xfs_iget( again: error = 0; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); if (ip) { - error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); if (error) goto out_error_or_again; } else { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); XFS_STATS_INC(xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 108c7a085f9..be7cf625421 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -887,7 +887,7 @@ xfs_iread( * around for a while. This helps to keep recently accessed * meta-data in-core longer. */ - XFS_BUF_SET_REF(bp, XFS_INO_REF); + xfs_buf_set_ref(bp, XFS_INO_REF); /* * Use xfs_trans_brelse() to release the buffer containing the @@ -2000,17 +2000,33 @@ xfs_ifree_cluster( */ for (i = 0; i < ninodes; i++) { retry: - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); - /* Inode not in memory or stale, nothing to do */ - if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { - read_unlock(&pag->pag_ici_lock); + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); continue; } /* + * because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* * Don't try to lock/unlock the current inode, but we * _cannot_ skip the other inodes that we did not find * in the list attached to the buffer and are not @@ -2019,11 +2035,11 @@ retry: */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); delay(1); goto retry; } - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); @@ -2629,7 +2645,7 @@ xfs_iflush_cluster( mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - read_lock(&pag->pag_ici_lock); + rcu_read_lock(); /* really need a gang lookup range call here */ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, first_index, inodes_per_cluster); @@ -2640,9 +2656,21 @@ xfs_iflush_cluster( iq = ilist[i]; if (iq == ip) continue; - /* if the inode lies outside this cluster, we're done. */ - if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) - break; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated @@ -2692,7 +2720,7 @@ xfs_iflush_cluster( } out_free: - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); kmem_free(ilist); out_put: xfs_perag_put(pag); @@ -2704,7 +2732,7 @@ cluster_corrupt_out: * Corruption detected in the clustering loop. Invalidate the * inode buffer and shut down the filesystem. */ - read_unlock(&pag->pag_ici_lock); + rcu_read_unlock(); /* * Clean up the buffer. If it was B_DELWRI, just release it -- * brelse can handle it with no problems. If not, shut down the diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fb2ca2e4cdc..5c95fa8ec11 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) /* * In-core inode flags. */ -#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ +#define XFS_ISTALE 0x0002 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ +#define XFS_INEW 0x0008 /* inode has just been allocated */ +#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ +#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ /* * Flags for inode locking. @@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) +extern struct lock_class_key xfs_iolock_reclaimable; + /* * Flags for xfs_itruncate_start(). */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7c8d30c453c..fd4f398bd6f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -842,15 +842,64 @@ xfs_inode_item_destroy( * flushed to disk. It is responsible for removing the inode item * from the AIL if it has not been re-logged, and unlocking the inode's * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. */ void xfs_iflush_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - xfs_inode_t *ip = iip->ili_inode; + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + prev = NULL; + while (blip != NULL) { + if (lip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + XFS_BUF_SET_FSPRIVATE(bp, next); + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; + + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; /* * We only want to pull the item from the AIL if it is @@ -861,28 +910,37 @@ xfs_iflush_done( * the lock since it's cheaper, and then we recheck while * holding the lock before removing the inode from the AIL. */ - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; spin_lock(&ailp->xa_lock); - if (lip->li_lsn == iip->ili_flush_lsn) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, lip); - } else { - spin_unlock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i); } - iip->ili_logged = 0; /* - * Clear the ili_last_fields bits now that we know that the - * data corresponding to them is safely on disk. + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. */ - iip->ili_last_fields = 0; + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; - /* - * Release the inode's flush lock since we're done with it. - */ - xfs_ifunlock(ip); + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } } /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 20576146369..8a0f044750c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -47,127 +47,8 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -#define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *); -STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); - -int -xfs_iomap( - struct xfs_inode *ip, - xfs_off_t offset, - ssize_t count, - int flags, - struct xfs_bmbt_irec *imap, - int *nimaps, - int *new) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - int bmapi_flags = 0; - - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - - *new = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - trace_xfs_iomap_enter(ip, offset, count, flags, NULL); - - switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { - case BMAPI_READ: - lockmode = xfs_ilock_map_shared(ip); - bmapi_flags = XFS_BMAPI_ENTIRE; - break; - case BMAPI_WRITE: - lockmode = XFS_ILOCK_EXCL; - if (flags & BMAPI_IGNSTATE) - bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; - xfs_ilock(ip, lockmode); - break; - case BMAPI_ALLOCATE: - lockmode = XFS_ILOCK_SHARED; - bmapi_flags = XFS_BMAPI_ENTIRE; - - /* Attempt non-blocking lock */ - if (flags & BMAPI_TRYLOCK) { - if (!xfs_ilock_nowait(ip, lockmode)) - return XFS_ERROR(EAGAIN); - } else { - xfs_ilock(ip, lockmode); - } - break; - default: - BUG(); - } - - ASSERT(offset <= mp->m_maxioffset); - if ((xfs_fsize_t)offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - - error = xfs_bmapi(NULL, ip, offset_fsb, - (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, imap, - nimaps, NULL); - - if (error) - goto out; - - switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { - case BMAPI_WRITE: - /* If we found an extent, return it */ - if (*nimaps && - (imap->br_startblock != HOLESTARTBLOCK) && - (imap->br_startblock != DELAYSTARTBLOCK)) { - trace_xfs_iomap_found(ip, offset, count, flags, imap); - break; - } - - if (flags & BMAPI_DIRECT) { - error = xfs_iomap_write_direct(ip, offset, count, flags, - imap, nimaps); - } else { - error = xfs_iomap_write_delay(ip, offset, count, flags, - imap, nimaps); - } - if (!error) { - trace_xfs_iomap_alloc(ip, offset, count, flags, imap); - } - *new = 1; - break; - case BMAPI_ALLOCATE: - /* If we found an extent, return it */ - xfs_iunlock(ip, lockmode); - lockmode = 0; - - if (*nimaps && !isnullstartblock(imap->br_startblock)) { - trace_xfs_iomap_found(ip, offset, count, flags, imap); - break; - } - - error = xfs_iomap_write_allocate(ip, offset, count, - imap, nimaps); - break; - } - - ASSERT(*nimaps <= 1); - -out: - if (lockmode) - xfs_iunlock(ip, lockmode); - return XFS_ERROR(error); -} - STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -STATIC int +int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int flags, xfs_bmbt_irec_t *imap, - int *nmaps) + int nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -279,7 +159,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) imap->br_blockcount + imap->br_startoff); @@ -331,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; - if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) + if (offset < ip->i_size || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -370,7 +250,6 @@ xfs_iomap_write_direct( goto error_out; } - *nmaps = 1; return 0; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ @@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - *nmaps = 0; /* nothing set-up here */ error_out: return XFS_ERROR(error); @@ -389,6 +267,9 @@ error_out: * If the caller is doing a write at the end of the file, then extend the * allocation out to the file system's write iosize. We clean up any extra * space left over when the file is closed in xfs_inactive(). + * + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it it not needed. */ STATIC int xfs_iomap_eof_want_preallocate( @@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, xfs_bmbt_irec_t *imap, int nimaps, int *prealloc) @@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate( xfs_filblks_t count_fsb; xfs_fsblock_t firstblock; int n, error, imaps; + int found_delalloc = 0; *prealloc = 0; if ((offset + count) <= ip->i_size) @@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate( return 0; start_fsb += imap[n].br_blockcount; count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; } } - *prealloc = 1; + if (!found_delalloc) + *prealloc = 1; return 0; } -STATIC int +/* + * If we don't have a user specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full. The closer the + * filesystem is to full, the smaller the maximum prealocation. + */ +STATIC xfs_fsblock_t +xfs_iomap_prealloc_size( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + xfs_fsblock_t alloc_blocks = 0; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + int shift = 0; + int64_t freesp; + + /* + * rounddown_pow_of_two() returns an undefined result + * if we pass in alloc_blocks = 0. Hence the "+ 1" to + * ensure we always pass in a non-zero value. + */ + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; + alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, + rounddown_pow_of_two(alloc_blocks)); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + if (shift) + alloc_blocks >>= shift; + } + + if (alloc_blocks < mp->m_writeio_blocks) + alloc_blocks = mp->m_writeio_blocks; + + return alloc_blocks; +} + +int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, - xfs_bmbt_irec_t *ret_imap, - int *nmaps) + xfs_bmbt_irec_t *ret_imap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -469,16 +401,19 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, - ioflag, imap, XFS_WRITE_IMAPS, &prealloc); + imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; retry: if (prealloc) { + xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + mp->m_writeio_blocks; + last_fsb = ioalign + alloc_blocks; } else { last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); } @@ -496,22 +431,31 @@ retry: XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, &nimaps, NULL); - if (error && (error != ENOSPC)) + switch (error) { + case 0: + case ENOSPC: + case EDQUOT: + break; + default: return XFS_ERROR(error); + } /* - * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space - flush all other inodes with - * delalloc blocks and retry without EOF preallocation. + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For + * ENOSPC, * flush all other inodes with delalloc blocks to free up + * some of the excess reserved metadata space. For both cases, retry + * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); if (flushed) - return XFS_ERROR(ENOSPC); + return XFS_ERROR(error ? error : ENOSPC); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error == ENOSPC) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + } flushed = 1; error = 0; @@ -523,8 +467,6 @@ retry: return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; - *nmaps = 1; - return 0; } @@ -538,13 +480,12 @@ retry: * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. */ -STATIC int +int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - xfs_bmbt_irec_t *imap, - int *retmap) + xfs_bmbt_irec_t *imap) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb, last_block; @@ -557,8 +498,6 @@ xfs_iomap_write_allocate( int error = 0; int nres; - *retmap = 0; - /* * Make sure that the dquots are there. */ @@ -680,7 +619,6 @@ xfs_iomap_write_allocate( if ((offset_fsb >= imap->br_startoff) && (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { - *retmap = 1; XFS_STATS_INC(xs_xstrat_quick); return 0; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7748a430f50..80615760959 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,30 +18,15 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -/* base extent manipulation calls */ -#define BMAPI_READ (1 << 0) /* read extents */ -#define BMAPI_WRITE (1 << 1) /* create extents */ -#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */ - -/* modifiers */ -#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */ -#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */ -#define BMAPI_MMA (1 << 6) /* allocate for mmap write */ -#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */ - -#define BMAPI_FLAGS \ - { BMAPI_READ, "READ" }, \ - { BMAPI_WRITE, "WRITE" }, \ - { BMAPI_ALLOCATE, "ALLOCATE" }, \ - { BMAPI_IGNSTATE, "IGNSTATE" }, \ - { BMAPI_DIRECT, "DIRECT" }, \ - { BMAPI_TRYLOCK, "TRYLOCK" } - struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, - struct xfs_bmbt_irec *, int *, int *); +extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index cee4ab9f8a9..ae6fef1ff56 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); +STATIC int xlog_space_left(struct log *log, atomic64_t *head); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); @@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); /* local functions to manipulate grant head */ STATIC int xlog_grant_log_space(xlog_t *log, xlog_ticket_t *xtic); -STATIC void xlog_grant_push_ail(xfs_mount_t *mp, +STATIC void xlog_grant_push_ail(struct log *log, int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); @@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); -STATIC void xlog_verify_grant_head(xlog_t *log, int equals); +STATIC void xlog_verify_grant_tail(struct log *log); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) -#define xlog_verify_grant_head(a,b) +#define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c,d) #define xlog_verify_tail_lsn(a,b,c) #endif STATIC int xlog_iclogs_empty(xlog_t *log); - static void -xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +xlog_grant_sub_space( + struct log *log, + atomic64_t *head, + int bytes) { - if (*qp) { - tic->t_next = (*qp); - tic->t_prev = (*qp)->t_prev; - (*qp)->t_prev->t_next = tic; - (*qp)->t_prev = tic; - } else { - tic->t_prev = tic->t_next = tic; - *qp = tic; - } + int64_t head_val = atomic64_read(head); + int64_t new, old; - tic->t_flags |= XLOG_TIC_IN_Q; -} + do { + int cycle, space; -static void -xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) -{ - if (tic == tic->t_next) { - *qp = NULL; - } else { - *qp = tic->t_next; - tic->t_next->t_prev = tic->t_prev; - tic->t_prev->t_next = tic->t_next; - } + xlog_crack_grant_head_val(head_val, &cycle, &space); - tic->t_next = tic->t_prev = NULL; - tic->t_flags &= ~XLOG_TIC_IN_Q; + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } static void -xlog_grant_sub_space(struct log *log, int bytes) +xlog_grant_add_space( + struct log *log, + atomic64_t *head, + int bytes) { - log->l_grant_write_bytes -= bytes; - if (log->l_grant_write_bytes < 0) { - log->l_grant_write_bytes += log->l_logsize; - log->l_grant_write_cycle--; - } - - log->l_grant_reserve_bytes -= bytes; - if ((log)->l_grant_reserve_bytes < 0) { - log->l_grant_reserve_bytes += log->l_logsize; - log->l_grant_reserve_cycle--; - } + int64_t head_val = atomic64_read(head); + int64_t new, old; -} + do { + int tmp; + int cycle, space; -static void -xlog_grant_add_space_write(struct log *log, int bytes) -{ - int tmp = log->l_logsize - log->l_grant_write_bytes; - if (tmp > bytes) - log->l_grant_write_bytes += bytes; - else { - log->l_grant_write_cycle++; - log->l_grant_write_bytes = bytes - tmp; - } -} + xlog_crack_grant_head_val(head_val, &cycle, &space); -static void -xlog_grant_add_space_reserve(struct log *log, int bytes) -{ - int tmp = log->l_logsize - log->l_grant_reserve_bytes; - if (tmp > bytes) - log->l_grant_reserve_bytes += bytes; - else { - log->l_grant_reserve_cycle++; - log->l_grant_reserve_bytes = bytes - tmp; - } -} + tmp = log->l_logsize - space; + if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } -static inline void -xlog_grant_add_space(struct log *log, int bytes) -{ - xlog_grant_add_space_write(log, bytes); - xlog_grant_add_space_reserve(log, bytes); + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } static void @@ -355,7 +330,7 @@ xfs_log_reserve( trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(mp, internal_ticket->t_unit_res); + xlog_grant_push_ail(log, internal_ticket->t_unit_res); retval = xlog_regrant_write_log_space(log, internal_ticket); } else { /* may sleep if need to allocate more tickets */ @@ -369,7 +344,7 @@ xfs_log_reserve( trace_xfs_log_reserve(log, internal_ticket); - xlog_grant_push_ail(mp, + xlog_grant_push_ail(log, (internal_ticket->t_unit_res * internal_ticket->t_cnt)); retval = xlog_grant_log_space(log, internal_ticket); @@ -402,7 +377,7 @@ xfs_log_mount( cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); else { cmn_err(CE_NOTE, - "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", + "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", mp->m_fsname); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - sv_wait(&iclog->ic_force_wait, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { spin_unlock(&log->l_icloglock); } @@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - sv_wait(&iclog->ic_force_wait, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { spin_unlock(&log->l_icloglock); } @@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp, { xlog_ticket_t *tic; xlog_t *log = mp->m_log; - int need_bytes, free_bytes, cycle, bytes; + int need_bytes, free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) return; - if (tail_lsn == 0) { - /* needed since sync_lsn is 64 bits */ - spin_lock(&log->l_icloglock); - tail_lsn = log->l_last_sync_lsn; - spin_unlock(&log->l_icloglock); - } - - spin_lock(&log->l_grant_lock); + if (tail_lsn == 0) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); - /* Also an invalid lsn. 1 implies that we aren't passing in a valid - * tail_lsn. - */ - if (tail_lsn != 1) { - log->l_tail_lsn = tail_lsn; - } + /* tail_lsn == 1 implies that we weren't passed a valid value. */ + if (tail_lsn != 1) + atomic64_set(&log->l_tail_lsn, tail_lsn); - if ((tic = log->l_write_headq)) { + if (!list_empty_careful(&log->l_writeq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - cycle = log->l_grant_write_cycle; - bytes = log->l_grant_write_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(tic, &log->l_writeq, t_queue) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < tic->t_unit_res && tail_lsn != 1) break; tail_lsn = 0; free_bytes -= tic->t_unit_res; - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_write_headq); + trace_xfs_log_regrant_write_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_write_lock); } - if ((tic = log->l_reserve_headq)) { + + if (!list_empty_careful(&log->l_reserveq)) { #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("Recovery problem"); #endif - cycle = log->l_grant_reserve_cycle; - bytes = log->l_grant_reserve_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { + spin_lock(&log->l_grant_reserve_lock); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + list_for_each_entry(tic, &log->l_reserveq, t_queue) { if (tic->t_flags & XLOG_TIC_PERM_RESERV) need_bytes = tic->t_unit_res*tic->t_cnt; else @@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp, break; tail_lsn = 0; free_bytes -= need_bytes; - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); + trace_xfs_log_grant_wake_up(log, tic); + wake_up(&tic->t_wait); + } + spin_unlock(&log->l_grant_reserve_lock); } - spin_unlock(&log->l_grant_lock); -} /* xfs_log_move_tail */ +} /* * Determine if we have a transaction that has gone to disk @@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp) * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t -xlog_assign_tail_lsn(xfs_mount_t *mp) +xlog_assign_tail_lsn( + struct xfs_mount *mp) { - xfs_lsn_t tail_lsn; - xlog_t *log = mp->m_log; + xfs_lsn_t tail_lsn; + struct log *log = mp->m_log; tail_lsn = xfs_trans_ail_tail(mp->m_ail); - spin_lock(&log->l_grant_lock); - if (tail_lsn != 0) { - log->l_tail_lsn = tail_lsn; - } else { - tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; - } - spin_unlock(&log->l_grant_lock); + if (!tail_lsn) + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; -} /* xlog_assign_tail_lsn */ - +} /* * Return the space in the log between the tail and the head. The head @@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp) * result is that we return the size of the log as the amount of space left. */ STATIC int -xlog_space_left(xlog_t *log, int cycle, int bytes) -{ - int free_bytes; - int tail_bytes; - int tail_cycle; - - tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); - tail_cycle = CYCLE_LSN(log->l_tail_lsn); - if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { - free_bytes = log->l_logsize - (bytes - tail_bytes); - } else if ((tail_cycle + 1) < cycle) { +xlog_space_left( + struct log *log, + atomic64_t *head) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) return 0; - } else if (tail_cycle < cycle) { - ASSERT(tail_cycle == (cycle - 1)); - free_bytes = tail_bytes - bytes; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; } else { /* * The reservation head is behind the tail. @@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes) "xlog_space_left: head behind tail\n" " tail_cycle = %d, tail_bytes = %d\n" " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, cycle, bytes); + tail_cycle, tail_bytes, head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } return free_bytes; -} /* xlog_space_left */ +} /* @@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_flags |= XLOG_ACTIVE_RECOVERY; log->l_prev_block = -1; - log->l_tail_lsn = xlog_assign_lsn(1, 0); /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ - log->l_last_sync_lsn = log->l_tail_lsn; + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - log->l_grant_reserve_cycle = 1; - log->l_grant_write_cycle = 1; + xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); + xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); + INIT_LIST_HEAD(&log->l_reserveq); + INIT_LIST_HEAD(&log->l_writeq); + spin_lock_init(&log->l_grant_reserve_lock); + spin_lock_init(&log->l_grant_write_lock); error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { @@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); - spin_lock_init(&log->l_grant_lock); - sv_init(&log->l_flush_wait, 0, "flush_wait"); + init_waitqueue_head(&log->l_flush_wait); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp, ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); - sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); iclogp = &iclog->ic_next; } @@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp, out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - if (iclog->ic_bp) { - sv_destroy(&iclog->ic_force_wait); - sv_destroy(&iclog->ic_write_wait); + if (iclog->ic_bp) xfs_buf_free(iclog->ic_bp); - } kmem_free(iclog); } spinlock_destroy(&log->l_icloglock); - spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); @@ -1223,61 +1189,60 @@ xlog_commit_record( * water mark. In this manner, we would be creating a low water mark. */ STATIC void -xlog_grant_push_ail(xfs_mount_t *mp, - int need_bytes) +xlog_grant_push_ail( + struct log *log, + int need_bytes) { - xlog_t *log = mp->m_log; /* pointer to the log */ - xfs_lsn_t tail_lsn; /* lsn of the log tail */ - xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ - int free_blocks; /* free blocks left to write to */ - int free_bytes; /* free bytes left to write to */ - int threshold_block; /* block in lsn we'd like to be at */ - int threshold_cycle; /* lsn cycle we'd like to be at */ - int free_threshold; - - ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - - spin_lock(&log->l_grant_lock); - free_bytes = xlog_space_left(log, - log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); - tail_lsn = log->l_tail_lsn; - free_blocks = BTOBBT(free_bytes); - - /* - * Set the threshold for the minimum number of free blocks in the - * log to the maximum of what the caller needs, one quarter of the - * log, and 256 blocks. - */ - free_threshold = BTOBB(need_bytes); - free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = MAX(free_threshold, 256); - if (free_blocks < free_threshold) { - threshold_block = BLOCK_LSN(tail_lsn) + free_threshold; - threshold_cycle = CYCLE_LSN(tail_lsn); + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; } - threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; - /* Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. */ - if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) - threshold_lsn = log->l_last_sync_lsn; - } - spin_unlock(&log->l_grant_lock); - - /* - * Get the transaction layer to kick the dirty buffers out to - * disk asynchronously. No point in trying to do this if - * the filesystem is shutting down. - */ - if (threshold_lsn && - !XLOG_FORCED_SHUTDOWN(log)) - xfs_trans_ail_push(log->l_ailp, threshold_lsn); -} /* xlog_grant_push_ail */ + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_trans_ail_push(log->l_ailp, threshold_lsn); +} /* * The bdstrat callback function for log bufs. This gives us a central @@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - spin_lock(&log->l_grant_lock); - xlog_grant_add_space(log, roundoff); - spin_unlock(&log->l_grant_lock); + xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); + xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); @@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log) iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { - sv_destroy(&iclog->ic_force_wait); - sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; } spinlock_destroy(&log->l_icloglock); - spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); log->l_mp->m_log = NULL; @@ -2232,7 +2193,7 @@ xlog_state_do_callback( lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { iclog = iclog->ic_next; continue; /* Leave this iclog for * another thread */ @@ -2240,23 +2201,21 @@ xlog_state_do_callback( iclog->ic_state = XLOG_STATE_CALLBACK; - spin_unlock(&log->l_icloglock); - /* l_last_sync_lsn field protected by - * l_grant_lock. Don't worry about iclog's lsn. - * No one else can be here except us. + /* + * update the last_sync_lsn before we drop the + * icloglock to ensure we are the only one that + * can update it. */ - spin_lock(&log->l_grant_lock); - ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - log->l_last_sync_lsn = - be64_to_cpu(iclog->ic_header.h_lsn); - spin_unlock(&log->l_grant_lock); + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); - } else { - spin_unlock(&log->l_icloglock); + } else ioerrors++; - } + + spin_unlock(&log->l_icloglock); /* * Keep processing entries in the callback list until @@ -2297,7 +2256,7 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - sv_broadcast(&iclog->ic_force_wait); + wake_up_all(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2344,7 +2303,7 @@ xlog_state_do_callback( spin_unlock(&log->l_icloglock); if (wake) - sv_broadcast(&log->l_flush_wait); + wake_up_all(&log->l_flush_wait); } @@ -2395,7 +2354,7 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - sv_broadcast(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ @@ -2444,7 +2403,7 @@ restart: XFS_STATS_INC(xs_log_noiclogs); /* Wait for log writes to have flushed */ - sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); + xlog_wait(&log->l_flush_wait, &log->l_icloglock); goto restart; } @@ -2527,6 +2486,18 @@ restart: * * Once a ticket gets put onto the reserveq, it will only return after * the needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off the reserveq under the + * l_grant_reserve_lock, we only need to take that lock if we are going + * to add the ticket to the queue and sleep. We can avoid taking the lock if the + * ticket was never added to the reserveq because the t_queue list head will be + * empty and we hold the only reference to it so it can safely be checked + * unlocked. */ STATIC int xlog_grant_log_space(xlog_t *log, @@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log, { int free_bytes; int need_bytes; -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif - #ifdef DEBUG if (log->l_flags & XLOG_ACTIVE_RECOVERY) panic("grant Recovery problem"); #endif - /* Is there space or do we need to sleep? */ - spin_lock(&log->l_grant_lock); - trace_xfs_log_grant_enter(log, tic); + need_bytes = tic->t_unit_res; + if (tic->t_flags & XFS_LOG_PERM_RESERV) + need_bytes *= tic->t_ocnt; + /* something is already sleeping; insert new transaction at end */ - if (log->l_reserve_headq) { - xlog_ins_ticketq(&log->l_reserve_headq, tic); + if (!list_empty_careful(&log->l_reserveq)) { + spin_lock(&log->l_grant_reserve_lock); + /* recheck the queue now we are locked */ + if (list_empty(&log->l_reserveq)) { + spin_unlock(&log->l_grant_reserve_lock); + goto redo; + } + list_add_tail(&tic->t_queue, &log->l_reserveq); trace_xfs_log_grant_sleep1(log, tic); @@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log, goto error_return; XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + /* * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... */ trace_xfs_log_grant_wake1(log, tic); - spin_lock(&log->l_grant_lock); } - if (tic->t_flags & XFS_LOG_PERM_RESERV) - need_bytes = tic->t_unit_res*tic->t_ocnt; - else - need_bytes = tic->t_unit_res; redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; - free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); + free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_reserve_headq, tic); + spin_lock(&log->l_grant_reserve_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_reserveq); trace_xfs_log_grant_sleep2(log, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); - - XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); - - spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - trace_xfs_log_grant_wake2(log, tic); + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + trace_xfs_log_grant_wake2(log, tic); goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); + } - /* we've got enough space */ - xlog_grant_add_space(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - /* - * Check to make sure the grant write head didn't just over lap the - * tail. If the cycles are the same, we can't be overlapping. - * Otherwise, make sure that the cycles differ by exactly one and - * check the byte count. - */ - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_reserve_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); } -#endif + + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_grant_exit(log, tic); - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); + xlog_verify_grant_tail(log); return 0; - error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - +error_return_unlocked: + spin_lock(&log->l_grant_reserve_lock); +error_return: + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_reserve_lock); trace_xfs_log_grant_error(log, tic); /* @@ -2638,7 +2597,6 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_grant_log_space */ @@ -2646,17 +2604,14 @@ redo: /* * Replenish the byte reservation required by moving the grant write head. * - * + * Similar to xlog_grant_log_space, the function is structured to have a lock + * free fast path. */ STATIC int xlog_regrant_write_log_space(xlog_t *log, xlog_ticket_t *tic) { int free_bytes, need_bytes; - xlog_ticket_t *ntic; -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif tic->t_curr_res = tic->t_unit_res; xlog_tic_reset_res(tic); @@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log, panic("regrant Recovery problem"); #endif - spin_lock(&log->l_grant_lock); - trace_xfs_log_regrant_write_enter(log, tic); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; /* If there are other waiters on the queue then give them a * chance at logspace before us. Wake up the first waiters, @@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log, * this transaction. */ need_bytes = tic->t_unit_res; - if ((ntic = log->l_write_headq)) { - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); - do { + if (!list_empty_careful(&log->l_writeq)) { + struct xlog_ticket *ntic; + + spin_lock(&log->l_grant_write_lock); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); + list_for_each_entry(ntic, &log->l_writeq, t_queue) { ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); if (free_bytes < ntic->t_unit_res) break; free_bytes -= ntic->t_unit_res; - sv_signal(&ntic->t_wait); - ntic = ntic->t_next; - } while (ntic != log->l_write_headq); - - if (ntic != log->l_write_headq) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_write_headq, tic); + wake_up(&ntic->t_wait); + } + if (ntic != list_first_entry(&log->l_writeq, + struct xlog_ticket, t_queue)) { + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); trace_xfs_log_regrant_write_sleep1(log, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); + xlog_grant_push_ail(log, need_bytes); XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_wait, PINOD|PLTWAIT, - &log->l_grant_lock, s); - - /* If we're shutting down, this tic is already - * off the queue */ - spin_lock(&log->l_grant_lock); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); trace_xfs_log_regrant_write_wake1(log, tic); - } + } else + spin_unlock(&log->l_grant_write_lock); } redo: if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; + goto error_return_unlocked; - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); + free_bytes = xlog_space_left(log, &log->l_grant_write_head); if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_write_headq, tic); - spin_unlock(&log->l_grant_lock); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); - - XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_regrant_write_sleep2(log, tic); - - sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); + spin_lock(&log->l_grant_write_lock); + if (list_empty(&tic->t_queue)) + list_add_tail(&tic->t_queue, &log->l_writeq); - /* If we're shutting down, this tic is already off the queue */ - spin_lock(&log->l_grant_lock); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep2(log, tic); + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + trace_xfs_log_regrant_write_wake2(log, tic); goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_write_headq, tic); + } - /* we've got enough space */ - xlog_grant_add_space_write(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + if (!list_empty(&tic->t_queue)) { + spin_lock(&log->l_grant_write_lock); + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); } -#endif + /* we've got enough space */ + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_regrant_write_exit(log, tic); - - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); + xlog_verify_grant_tail(log); return 0; + error_return_unlocked: + spin_lock(&log->l_grant_write_lock); error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - + list_del_init(&tic->t_queue); + spin_unlock(&log->l_grant_write_lock); trace_xfs_log_regrant_write_error(log, tic); /* @@ -2778,7 +2714,6 @@ redo: */ tic->t_curr_res = 0; tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - spin_unlock(&log->l_grant_lock); return XFS_ERROR(EIO); } /* xlog_regrant_write_log_space */ @@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log, if (ticket->t_cnt > 0) ticket->t_cnt--; - spin_lock(&log->l_grant_lock); - xlog_grant_sub_space(log, ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_reserve_head, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_grant_write_head, + ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); trace_xfs_log_regrant_reserve_sub(log, ticket); - xlog_verify_grant_head(log, 1); - /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) { - spin_unlock(&log->l_grant_lock); + if (ticket->t_cnt > 0) return; - } - xlog_grant_add_space_reserve(log, ticket->t_unit_res); + xlog_grant_add_space(log, &log->l_grant_reserve_head, + ticket->t_unit_res); trace_xfs_log_regrant_reserve_exit(log, ticket); - xlog_verify_grant_head(log, 0); - spin_unlock(&log->l_grant_lock); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); } /* xlog_regrant_reserve_log_space */ @@ -2843,28 +2775,29 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket) { + int bytes; + if (ticket->t_cnt > 0) ticket->t_cnt--; - spin_lock(&log->l_grant_lock); trace_xfs_log_ungrant_enter(log, ticket); - - xlog_grant_sub_space(log, ticket->t_curr_res); - trace_xfs_log_ungrant_sub(log, ticket); - /* If this is a permanent reservation ticket, we may be able to free + /* + * If this is a permanent reservation ticket, we may be able to free * up more space based on the remaining count. */ + bytes = ticket->t_curr_res; if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); - xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); + bytes += ticket->t_unit_res*ticket->t_cnt; } + xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); + xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); + trace_xfs_log_ungrant_exit(log, ticket); - xlog_verify_grant_head(log, 1); - spin_unlock(&log->l_grant_lock); xfs_log_move_tail(log->l_mp, 1); } /* xlog_ungrant_log_space */ @@ -2901,11 +2834,11 @@ xlog_state_release_iclog( if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { /* update tail before writing to iclog */ - xlog_assign_tail_lsn(log->l_mp); + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); - xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); /* cycle incremented when incrementing curr_block */ } spin_unlock(&log->l_icloglock); @@ -3088,7 +3021,7 @@ maybe_sleep: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3206,8 +3139,8 @@ try_again: XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_write_wait, - PSWP, &log->l_icloglock, s); + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); if (log_flushed) *log_flushed = 1; already_slept = 1; @@ -3235,7 +3168,7 @@ try_again: return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3310,10 +3243,8 @@ xfs_log_ticket_put( xlog_ticket_t *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); - if (atomic_dec_and_test(&ticket->t_ref)) { - sv_destroy(&ticket->t_wait); + if (atomic_dec_and_test(&ticket->t_ref)) kmem_zone_free(xfs_log_ticket_zone, ticket); - } } xlog_ticket_t * @@ -3435,6 +3366,7 @@ xlog_ticket_alloc( } atomic_set(&tic->t_ref, 1); + INIT_LIST_HEAD(&tic->t_queue); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; @@ -3445,7 +3377,7 @@ xlog_ticket_alloc( tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); + init_waitqueue_head(&tic->t_wait); xlog_tic_reset_res(tic); @@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr( } STATIC void -xlog_verify_grant_head(xlog_t *log, int equals) +xlog_verify_grant_tail( + struct log *log) { - if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { - if (equals) - ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); - else - ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); - } else { - ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); - ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); - } -} /* xlog_verify_grant_head */ + int tail_cycle, tail_blocks; + int cycle, space; + + /* + * Check to make sure the grant write head didn't just over lap the + * tail. If the cycles are the same, we can't be overlapping. + * Otherwise, make sure that the cycles differ by exactly one and + * check the byte count. + */ + xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + ASSERT(cycle - 1 == tail_cycle); + ASSERT(space <= BBTOB(tail_blocks)); + } +} /* check if it will fit */ STATIC void @@ -3716,12 +3655,10 @@ xfs_log_force_umount( xlog_cil_force(log); /* - * We must hold both the GRANT lock and the LOG lock, - * before we mark the filesystem SHUTDOWN and wake - * everybody up to tell the bad news. + * mark the filesystem and the as in a shutdown state and wake + * everybody up to tell them the bad news. */ spin_lock(&log->l_icloglock); - spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; if (mp->m_sb_bp) XFS_BUF_DONE(mp->m_sb_bp); @@ -3742,27 +3679,21 @@ xfs_log_force_umount( spin_unlock(&log->l_icloglock); /* - * We don't want anybody waiting for log reservations - * after this. That means we have to wake up everybody - * queued up on reserve_headq as well as write_headq. - * In addition, we make sure in xlog_{re}grant_log_space - * that we don't enqueue anything once the SHUTDOWN flag - * is set, and this action is protected by the GRANTLOCK. + * We don't want anybody waiting for log reservations after this. That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks. */ - if ((tic = log->l_reserve_headq)) { - do { - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); - } - - if ((tic = log->l_write_headq)) { - do { - sv_signal(&tic->t_wait); - tic = tic->t_next; - } while (tic != log->l_write_headq); - } - spin_unlock(&log->l_grant_lock); + spin_lock(&log->l_grant_reserve_lock); + list_for_each_entry(tic, &log->l_reserveq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_reserve_lock); + + spin_lock(&log->l_grant_write_lock); + list_for_each_entry(tic, &log->l_writeq, t_queue) + wake_up(&tic->t_wait); + spin_unlock(&log->l_grant_write_lock); if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 916eb7db14d..3bd3291ef8d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); -int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 23d6ceb5e97..9ca59be0897 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -61,7 +61,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); init_rwsem(&cil->xc_ctx_lock); - sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + init_waitqueue_head(&cil->xc_commit_wait); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); @@ -361,15 +361,10 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; - struct xfs_log_vec *lv; - int abortflag = abort ? XFS_LI_ABORTED : 0; struct xfs_busy_extent *busyp, *n; - /* unpin all the log items */ - for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { - xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, - abortflag); - } + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); @@ -548,7 +543,7 @@ xlog_cil_push( error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); if (error) - goto out_abort; + goto out_abort_free_ticket; /* * now that we've written the checkpoint into the log, strictly @@ -568,14 +563,15 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); goto restart; } } spin_unlock(&cil->xc_cil_lock); + /* xfs_log_done always frees the ticket on error. */ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); - if (error || commit_lsn == -1) + if (commit_lsn == -1) goto out_abort; /* attach all the transactions w/ busy extents to iclog */ @@ -592,7 +588,7 @@ restart: */ spin_lock(&cil->xc_cil_lock); ctx->commit_lsn = commit_lsn; - sv_broadcast(&cil->xc_commit_wait); + wake_up_all(&cil->xc_commit_wait); spin_unlock(&cil->xc_cil_lock); /* release the hounds! */ @@ -605,6 +601,8 @@ out_free_ticket: kmem_free(new_ctx); return 0; +out_abort_free_ticket: + xfs_log_ticket_put(tic); out_abort: xlog_cil_committed(ctx, XFS_LI_ABORTED); return XFS_ERROR(EIO); @@ -627,7 +625,7 @@ out_abort: * background commit, returns without it held once background commits are * allowed again. */ -int +void xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, @@ -642,11 +640,6 @@ xfs_log_commit_cil( if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; - if (XLOG_FORCED_SHUTDOWN(log)) { - xlog_cil_free_logvec(log_vector); - return XFS_ERROR(EIO); - } - /* * do all the hard work of formatting items (including memory * allocation) outside the CIL context lock. This prevents stalling CIL @@ -706,7 +699,6 @@ xfs_log_commit_cil( */ if (push) xlog_cil_push(log, 0); - return 0; } /* @@ -757,7 +749,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); goto restart; } if (ctx->sequence != sequence) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index edcdfe01617..d5f8be8f4bf 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -21,7 +21,6 @@ struct xfs_buf; struct log; struct xlog_ticket; -struct xfs_buf_cancel; struct xfs_mount; /* @@ -54,7 +53,6 @@ struct xfs_mount; BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) - static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) { return ((xfs_lsn_t)cycle << 32) | block; @@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i) */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ -#define XLOG_TIC_IN_Q 0x4 #define XLOG_TIC_FLAGS \ { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ - { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ - { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" } + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } #endif /* __KERNEL__ */ @@ -244,9 +240,8 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - sv_t t_wait; /* ticket wait queue : 20 */ - struct xlog_ticket *t_next; /* :4|8 */ - struct xlog_ticket *t_prev; /* :4|8 */ + wait_queue_head_t t_wait; /* ticket wait queue */ + struct list_head t_queue; /* reserve/write queue */ xlog_tid_t t_tid; /* transaction identifier : 4 */ atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ @@ -353,8 +348,8 @@ typedef union xlog_in_core2 { * and move everything else out to subsequent cachelines. */ typedef struct xlog_in_core { - sv_t ic_force_wait; - sv_t ic_write_wait; + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; @@ -421,7 +416,7 @@ struct xfs_cil { struct xfs_cil_ctx *xc_ctx; struct rw_semaphore xc_ctx_lock; struct list_head xc_committing; - sv_t xc_commit_wait; + wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; }; @@ -491,7 +486,7 @@ typedef struct log { struct xfs_buftarg *l_targ; /* buftarg of log */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ - struct xfs_buf_cancel **l_buf_cancel_table; + struct list_head *l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ @@ -503,29 +498,40 @@ typedef struct log { int l_logBBsize; /* size of log in BB chunks */ /* The following block of fields are changed while holding icloglock */ - sv_t l_flush_wait ____cacheline_aligned_in_smp; + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ xlog_in_core_t *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ - xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed - * buffers */ - xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ - /* The following block of fields are changed while holding grant_lock */ - spinlock_t l_grant_lock ____cacheline_aligned_in_smp; - xlog_ticket_t *l_reserve_headq; - xlog_ticket_t *l_write_headq; - int l_grant_reserve_cycle; - int l_grant_reserve_bytes; - int l_grant_write_cycle; - int l_grant_write_bytes; + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + /* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ + spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; + struct list_head l_reserveq; + atomic64_t l_grant_reserve_head; + + spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; + struct list_head l_writeq; + atomic64_t l_grant_write_head; /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG @@ -534,6 +540,9 @@ typedef struct log { } xlog_t; +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ @@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector, xlog_in_core_t **commit_iclog, uint flags); /* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to smaple and crack LSNs taht are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* * Committed Item List interfaces */ int xlog_cil_init(struct log *log); @@ -585,6 +649,21 @@ xlog_cil_force(struct log *log) */ #define XLOG_UNMOUNT_REC_TYPE (-1U) +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} #endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 966d3f97458..aa0ebb77690 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *); #endif /* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -925,12 +936,12 @@ xlog_find_tail( log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); - log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); - log->l_grant_reserve_cycle = log->l_curr_cycle; - log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); - log->l_grant_write_cycle = log->l_curr_cycle; - log->l_grant_write_bytes = BBTOB(log->l_curr_block); + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + BBTOB(log->l_curr_block)); /* * Look for unmount record. If we find it, then we know there @@ -960,7 +971,7 @@ xlog_find_tail( } after_umount_blk = (i + hblks + (int) BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; - tail_lsn = log->l_tail_lsn; + tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; @@ -975,12 +986,10 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. */ - log->l_tail_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); - log->l_last_sync_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; /* @@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans( * record in the table to tell us how many times we expect to see this * record during the second pass. */ -STATIC void -xlog_recover_do_buffer_pass1( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +STATIC int +xlog_recover_buffer_pass1( + struct log *log, + xlog_recover_item_t *item) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *nextp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; - xfs_daddr_t blkno = 0; - uint len = 0; - ushort flags = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - } + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLF_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return; - } - - /* - * Insert an xfs_buf_cancel record into the hash table of - * them. If there is already an identical record, bump - * its reference count. - */ - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - /* - * If the hash bucket is empty then just insert a new record into - * the bucket. - */ - if (*bucket == NULL) { - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; - bcp->bc_refcount = 1; - bcp->bc_next = NULL; - *bucket = bcp; - return; + return 0; } /* - * The hash bucket is not empty, so search for duplicates of our - * record. If we find one them just bump its refcount. If not - * then add us at the end of the list. + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. */ - prevp = NULL; - nextp = *bucket; - while (nextp != NULL) { - if (nextp->bc_blkno == blkno && nextp->bc_len == len) { - nextp->bc_refcount++; + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return; + return 0; } - prevp = nextp; - nextp = nextp->bc_next; - } - ASSERT(prevp != NULL); - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; - bcp->bc_next = NULL; - prevp->bc_next = bcp; + list_add_tail(&bcp->bc_list, bucket); + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; } /* @@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1( */ STATIC int xlog_check_buffer_cancelled( - xlog_t *log, + struct log *log, xfs_daddr_t blkno, uint len, ushort flags) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; if (log->l_buf_cancel_table == NULL) { /* @@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled( return 0; } - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - bcp = *bucket; - if (bcp == NULL) { - /* - * There is no corresponding entry in the table built - * in pass one, so this buffer has not been cancelled. - */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; - } - /* - * Search for an entry in the buffer cancel table that - * matches our buffer. + * Search for an entry in the cancel table that matches our buffer. */ - prevp = NULL; - while (bcp != NULL) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) { - /* - * We've go a match, so return 1 so that the - * recovery of this buffer is cancelled. - * If this buffer is actually a buffer cancel - * log item, then decrement the refcount on the - * one in the table and remove it if this is the - * last reference. - */ - if (flags & XFS_BLF_CANCEL) { - bcp->bc_refcount--; - if (bcp->bc_refcount == 0) { - if (prevp == NULL) { - *bucket = bcp->bc_next; - } else { - prevp->bc_next = bcp->bc_next; - } - kmem_free(bcp); - } - } - return 1; - } - prevp = bcp; - bcp = bcp->bc_next; + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + goto found; } + /* - * We didn't find a corresponding entry in the table, so - * return 0 so that the buffer is NOT cancelled. + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; -} -STATIC int -xlog_recover_do_buffer_pass2( - xlog_t *log, - xfs_buf_log_format_t *buf_f) -{ - xfs_daddr_t blkno = 0; - ushort flags = 0; - uint len = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - flags = buf_f->blf_flags; - len = buf_f->blf_len; - break; +found: + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } } - - return xlog_check_buffer_cancelled(log, blkno, len, flags); + return 1; } /* - * Perform recovery for a buffer full of inodes. In these buffers, - * the only data which should be recovered is that which corresponds - * to the di_next_unlinked pointers in the on disk inode structures. - * The rest of the data for the inodes is always logged through the - * inodes themselves rather than the inode buffer and is recovered - * in xlog_recover_do_inode_trans(). + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). * - * The only time when buffers full of inodes are fully recovered is - * when the buffer is full of newly allocated inodes. In this case - * the buffer will not be marked as an inode buffer and so will be - * sent to xlog_recover_do_reg_buffer() below during recovery. + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( - xfs_mount_t *mp, + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; - int item_index; - int bit; - int nbits; - int reg_buf_offset; - int reg_buf_bytes; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - unsigned int *data_map = NULL; - unsigned int map_size = 0; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } - /* - * Set the variables corresponding to the current region to - * 0 so that we'll initialize them on the first pass through - * the loop. - */ - reg_buf_offset = 0; - reg_buf_bytes = 0; - bit = 0; - nbits = 0; - item_index = 0; inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + @@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer( * the current di_next_unlinked field. */ bit += nbits; - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. */ - if (bit == -1) { + if (bit == -1) return 0; - } - nbits = xfs_contig_bits(data_map, map_size, - bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); reg_buf_offset = bit << XFS_BLF_SHIFT; reg_buf_bytes = nbits << XFS_BLF_SHIFT; @@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer( * di_next_unlinked field, then move on to the next * di_next_unlinked field. */ - if (next_unlinked_offset < reg_buf_offset) { + if (next_unlinked_offset < reg_buf_offset) continue; - } ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); @@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer( * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. */ -/*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; - unsigned int *data_map = NULL; - unsigned int map_size = 0; int error; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); if (bit == -1) break; - nbits = xfs_contig_bits(data_map, map_size, bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); @@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer( * for more details on the implementation of the table of cancel records. */ STATIC int -xlog_recover_do_buffer_trans( +xlog_recover_buffer_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; - int cancel; - xfs_daddr_t blkno; - int len; - ushort flags; uint buf_flags; - if (pass == XLOG_RECOVER_PASS1) { - /* - * In this pass we're only looking for buf items - * with the XFS_BLF_CANCEL bit set. - */ - xlog_recover_do_buffer_pass1(log, buf_f); + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; - } else { - /* - * In this pass we want to recover all the buffers - * which have not been cancelled and are not - * cancellation buffers themselves. The routine - * we call here will tell us whether or not to - * continue with the replay of this buffer. - */ - cancel = xlog_recover_do_buffer_pass2(log, buf_f); - if (cancel) { - trace_xfs_log_recover_buf_cancel(log, buf_f); - return 0; - } } + trace_xfs_log_recover_buf_recover(log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - default: - xfs_fs_cmn_err(CE_ALERT, log->l_mp, - "xfs_log_recover: unknown buffer type 0x%x, logdev %s", - buf_f->blf_type, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); - XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", - XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); - } - mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(flags & XFS_BLF_INODE_BUF)) + if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags); if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, - bp, blkno); + xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, + bp, buf_f->blf_blkno); error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } error = 0; - if (flags & XFS_BLF_INODE_BUF) { + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (flags & + } else if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { @@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans( } STATIC int -xlog_recover_do_inode_trans( +xlog_recover_inode_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_inode_log_format_t *in_f; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; - xfs_ino_t ino; int len; xfs_caddr_t src; xfs_caddr_t dest; @@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans( xfs_icdinode_t *dicp; int need_free = 0; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { in_f = item->ri_buf[0].i_addr; } else { @@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans( if (error) goto error; } - ino = in_f->ilf_ino; - mp = log->l_mp; /* * Inode buffers can be freed, look out for it, @@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", - dip, bp, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", + dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans( xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", - item, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", + item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; goto error; @@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans( if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } @@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans( if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; goto error; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", - item, dip, bp, ino, + item, dip, bp, in_f->ilf_ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; goto error; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", - item, dip, bp, ino, dicp->di_forkoff); + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; goto error; } if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, @@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans( break; default: - xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); + xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); ASSERT(0); xfs_buf_relse(bp); error = EIO; @@ -2556,18 +2422,11 @@ error: * of that type. */ STATIC int -xlog_recover_do_quotaoff_trans( +xlog_recover_quotaoff_pass1( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { - xfs_qoff_logformat_t *qoff_f; - - if (pass == XLOG_RECOVER_PASS2) { - return (0); - } - - qoff_f = item->ri_buf[0].i_addr; + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans( * Recover a dquot record */ STATIC int -xlog_recover_do_dquot_trans( +xlog_recover_dquot_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - mp = log->l_mp; /* * Filesystems are required to send in quota flags at mount time. @@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans( if ((error = xfs_qm_dqcheck(recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans (log copy)"))) { + "xlog_recover_dquot_pass2 (log copy)"))) { return XFS_ERROR(EIO); } ASSERT(dq_f->qlf_len == 1); @@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans( * minimal initialization then. */ if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans")) { + "xlog_recover_dquot_pass2")) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } @@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans( * LSN. */ STATIC int -xlog_recover_do_efi_trans( +xlog_recover_efi_pass2( xlog_t *log, xlog_recover_item_t *item, - xfs_lsn_t lsn, - int pass) + xfs_lsn_t lsn) { int error; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - efi_formatp = item->ri_buf[0].i_addr; - mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } - efip->efi_next_extent = efi_formatp->efi_nextents; - efip->efi_flags |= XFS_EFI_COMMITTED; + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); spin_lock(&log->l_ailp->xa_lock); /* * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); return 0; } @@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans( * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ -STATIC void -xlog_recover_do_efd_trans( +STATIC int +xlog_recover_efd_pass2( xlog_t *log, - xlog_recover_item_t *item, - int pass) + xlog_recover_item_t *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans( struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; - if (pass == XLOG_RECOVER_PASS1) { - return; - } - efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || @@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans( } xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); -} - -/* - * Perform the transaction - * - * If the transaction modifies a buffer or inode, do it now. Otherwise, - * EFIs and EFDs get queued up by adding entries into the AIL for them. - */ -STATIC int -xlog_recover_do_trans( - xlog_t *log, - xlog_recover_t *trans, - int pass) -{ - int error = 0; - xlog_recover_item_t *item; - - error = xlog_recover_reorder_trans(log, trans, pass); - if (error) - return error; - - list_for_each_entry(item, &trans->r_itemq, ri_list) { - trace_xfs_log_recover_item_recover(log, trans, item, pass); - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - error = xlog_recover_do_buffer_trans(log, item, pass); - break; - case XFS_LI_INODE: - error = xlog_recover_do_inode_trans(log, item, pass); - break; - case XFS_LI_EFI: - error = xlog_recover_do_efi_trans(log, item, - trans->r_lsn, pass); - break; - case XFS_LI_EFD: - xlog_recover_do_efd_trans(log, item, pass); - error = 0; - break; - case XFS_LI_DQUOT: - error = xlog_recover_do_dquot_trans(log, item, pass); - break; - case XFS_LI_QUOTAOFF: - error = xlog_recover_do_quotaoff_trans(log, item, - pass); - break; - default: - xlog_warn( - "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); - ASSERT(0); - error = XFS_ERROR(EIO); - break; - } - - if (error) - return error; - } return 0; } @@ -2852,7 +2638,7 @@ xlog_recover_do_trans( */ STATIC void xlog_recover_free_trans( - xlog_recover_t *trans) + struct xlog_recover *trans) { xlog_recover_item_t *item, *n; int i; @@ -2871,17 +2657,95 @@ xlog_recover_free_trans( } STATIC int +xlog_recover_commit_pass1( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + /* nothing to do in pass 1 */ + return 0; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_commit_pass1", + ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_commit_pass2( + struct log *log, + struct xlog_recover *trans, + xlog_recover_item_t *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, item); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, item); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_commit_pass2", + ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. + */ +STATIC int xlog_recover_commit_trans( - xlog_t *log, - xlog_recover_t *trans, + struct log *log, + struct xlog_recover *trans, int pass) { - int error; + int error = 0; + xlog_recover_item_t *item; hlist_del(&trans->r_list); - if ((error = xlog_recover_do_trans(log, trans, pass))) + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) return error; - xlog_recover_free_trans(trans); /* no error */ + + list_for_each_entry(item, &trans->r_itemq, ri_list) { + if (pass == XLOG_RECOVER_PASS1) + error = xlog_recover_commit_pass1(log, trans, item); + else + error = xlog_recover_commit_pass2(log, trans, item); + if (error) + return error; + } + + xlog_recover_free_trans(trans); return 0; } @@ -3011,7 +2875,7 @@ xlog_recover_process_efi( xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); /* * First check the validity of the extents described by the @@ -3050,7 +2914,7 @@ xlog_recover_process_efi( extp->ext_len); } - efip->efi_flags |= XFS_EFI_RECOVERED; + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp, 0); return error; @@ -3107,7 +2971,7 @@ xlog_recover_process_efis( * Skip EFIs that we've already processed. */ efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_flags & XFS_EFI_RECOVERED) { + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } @@ -3724,7 +3588,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error; + int error, i; ASSERT(head_blk != tail_blk); @@ -3732,10 +3596,12 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = - (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(xfs_buf_cancel_t*), + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { @@ -3754,7 +3620,7 @@ xlog_do_log_recovery( int i; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(log->l_buf_cancel_table[i] == NULL); + ASSERT(list_empty(&log->l_buf_cancel_table[i])); } #endif /* DEBUG */ @@ -3934,7 +3800,7 @@ xlog_recover_finish( log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", + "Ending clean XFS mount for filesystem: %s\n", log->l_mp->m_fsname); } return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 19e9dfa1c25..d447aef84bc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -472,7 +472,7 @@ xfs_initialize_perag( goto out_unwind; pag->pag_agno = index; pag->pag_mount = mp; - rwlock_init(&pag->pag_ici_lock); + spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); spin_lock_init(&pag->pag_buf_lock); @@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp) } /* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); + } +} + + +/* * Set whether we're using inode alignment. */ STATIC void @@ -1196,6 +1214,9 @@ xfs_mountfs( */ xfs_set_rw_sizes(mp); + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + /* * Set the inode cluster size. * This may still be overridden by the file system diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 5861b498074..a62e8971539 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, xfs_mod_incore_sb(mp, field, delta, rsvd) #endif +/* dynamic preallocation free space thresholds, 5% down to 1% */ +enum { + XFS_LOWSP_1_PCNT = 0, + XFS_LOWSP_2_PCNT, + XFS_LOWSP_3_PCNT, + XFS_LOWSP_4_PCNT, + XFS_LOWSP_5_PCNT, + XFS_LOWSP_MAX, +}; + typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -202,6 +212,8 @@ typedef struct xfs_mount { __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ + int64_t m_low_space[XFS_LOWSP_MAX]; + /* low free space thresholds */ } xfs_mount_t; /* @@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); +extern void xfs_set_low_space_thresholds(struct xfs_mount *); + #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f6d956b7711..76922793f64 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1137,7 +1137,7 @@ out_undo_fdblocks: if (blkdelta) xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); out: - ASSERT(error = 0); + ASSERT(error == 0); return; } @@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs( * they could be immediately flushed and we'd have to race with the flusher * trying to pull the item from the AIL as we add it. */ -void +static void xfs_trans_item_committed( struct xfs_log_item *lip, xfs_lsn_t commit_lsn, @@ -1425,21 +1425,120 @@ xfs_trans_committed( xfs_trans_free(tp); } +static inline void +xfs_log_item_batch_insert( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) + IOP_UNPIN(log_items[i], 0); +} + /* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. + * Bulk operation version of xfs_trans_committed that takes a log vector of + * items to insert into the AIL. This uses bulk AIL insertion techniques to + * minimise lock traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which + * means that checkpoint commit abort handling is treated exactly the same + * as an iclog write error even though we haven't started any IO yet. Hence in + * this case all we need to do is IOP_COMMITTED processing, followed by an + * IOP_UNPIN(aborted) call. + */ +void +xfs_trans_committed_bulk( + struct xfs_ail *ailp, + struct xfs_log_vec *log_vector, + xfs_lsn_t commit_lsn, + int aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + int i = 0; + + /* unpin all the log items */ + for (lv = log_vector; lv; lv = lv->lv_next ) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* item_lsn of -1 means the item was freed */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + IOP_UNPIN(lip, 1); + continue; + } + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. + */ + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->xa_lock); + IOP_UNPIN(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xfs_log_item_batch_insert(ailp, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); +} + +/* + * Called from the trans_commit code when we notice that the filesystem is in + * the middle of a forced shutdown. + * + * When we are called here, we have already pinned all the items in the + * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called + * so we can simply walk the items in the transaction, unpin them with an abort + * flag and then free the items. Note that unpinning the items can result in + * them being freed immediately, so we need to use a safe list traversal method + * here. */ STATIC void xfs_trans_uncommit( struct xfs_trans *tp, uint flags) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item_desc *lidp, *n; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* - * Unpin all but those that aren't dirty. - */ + list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { if (lidp->lid_flags & XFS_LID_DIRTY) IOP_UNPIN(lidp->lid_item, 1); } @@ -1656,7 +1755,6 @@ xfs_trans_commit_cil( int flags) { struct xfs_log_vec *log_vector; - int error; /* * Get each log item to allocate a vector structure for @@ -1667,9 +1765,7 @@ xfs_trans_commit_cil( if (!log_vector) return ENOMEM; - error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); - if (error) - return error; + xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 246286b77a8..c2042b736b8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -294,8 +294,8 @@ struct xfs_log_item_desc { #define XFS_ALLOC_BTREE_REF 2 #define XFS_BMAP_BTREE_REF 2 #define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 -#define XFS_INO_REF 1 #define XFS_DQUOT_REF 1 #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index dc9069568ff..c5bbbc45db9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -28,8 +28,8 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); +STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); +STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); @@ -449,129 +449,152 @@ xfs_trans_unlocked_item( xfs_log_move_tail(ailp->xa_mount, 1); } /* xfs_trans_unlocked_item */ - /* - * Update the position of the item in the AIL with the new - * lsn. If it is not yet in the AIL, add it. Otherwise, move - * it to its new position by removing it and re-adding it. + * xfs_trans_ail_update - bulk AIL insertion operation. + * + * @xfs_trans_ail_update takes an array of log items that all need to be + * positioned at the same LSN in the AIL. If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. * - * Wakeup anyone with an lsn less than the item's lsn. If the item - * we move in the AIL is the minimum one, update the tail lsn in the - * log manager. + * This function takes the AIL lock once to execute the update operations on + * all the items in the array, and as such should not be called with the AIL + * lock held. As a result, once we have the AIL lock, we need to check each log + * item LSN to confirm it needs to be moved forward in the AIL. * - * This function must be called with the AIL lock held. The lock - * is dropped before returning. + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. */ void -xfs_trans_ail_update( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - xfs_lsn_t lsn) __releases(ailp->xa_lock) +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_log_item_t *dlip = NULL; - xfs_log_item_t *mlip; /* ptr to minimum lip */ + xfs_log_item_t *mlip; xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); mlip = xfs_ail_min(ailp); - if (lip->li_flags & XFS_LI_IN_AIL) { - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); - xfs_trans_ail_cursor_clear(ailp, dlip); - } else { - lip->li_flags |= XFS_LI_IN_AIL; + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); } - lip->li_lsn = lsn; - xfs_ail_insert(ailp, lip); + xfs_ail_splice(ailp, &tmp, lsn); - if (mlip == dlip) { - mlip = xfs_ail_min(ailp); - /* - * It is not safe to access mlip after the AIL lock is - * dropped, so we must get a copy of li_lsn before we do - * so. This is especially important on 32-bit platforms - * where accessing and updating 64-bit values like li_lsn - * is not atomic. - */ - tail_lsn = mlip->li_lsn; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); - } else { + if (!mlip_changed) { spin_unlock(&ailp->xa_lock); + return; } - -} /* xfs_trans_update_ail */ + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip->li_lsn; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} /* - * Delete the given item from the AIL. It must already be in - * the AIL. + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL * - * Wakeup anyone with an lsn less than item's lsn. If the item - * we delete in the AIL is the minimum one, update the tail lsn in the - * log manager. + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * removed from the AIL. The caller is already holding the AIL lock, and done + * all the checks necessary to ensure the items passed in via @log_items are + * ready for deletion. This includes checking that the items are in the AIL. * - * Clear the IN_AIL flag from the item, reset its lsn to 0, and - * bump the AIL's generation count to indicate that the tree - * has changed. + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. * - * This function must be called with the AIL lock held. The lock - * is dropped before returning. + * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. */ void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip) __releases(ailp->xa_lock) +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items) __releases(ailp->xa_lock) { - xfs_log_item_t *dlip; xfs_log_item_t *mlip; xfs_lsn_t tail_lsn; + int mlip_changed = 0; + int i; - if (lip->li_flags & XFS_LI_IN_AIL) { - mlip = xfs_ail_min(ailp); - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); - xfs_trans_ail_cursor_clear(ailp, dlip); - + mlip = xfs_ail_min(ailp); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; - if (mlip == dlip) { - mlip = xfs_ail_min(ailp); - /* - * It is not safe to access mlip after the AIL lock - * is dropped, so we must get a copy of li_lsn - * before we do so. This is especially important - * on 32-bit platforms where accessing and updating - * 64-bit values like li_lsn is not atomic. - */ - tail_lsn = mlip ? mlip->li_lsn : 0; - spin_unlock(&ailp->xa_lock); - xfs_log_move_tail(ailp->xa_mount, tail_lsn); - } else { spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + return; } + + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; } - else { - /* - * If the file system is not being shutdown, we are in - * serious trouble if we get to this stage. - */ - struct xfs_mount *mp = ailp->xa_mount; + if (!mlip_changed) { spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + return; } -} - + /* + * It is not safe to access mlip after the AIL lock is dropped, so we + * must get a copy of li_lsn before we do so. This is especially + * important on 32-bit platforms where accessing and updating 64-bit + * values like li_lsn is not atomic. It is possible we've emptied the + * AIL here, so if that is the case, pass an LSN of 0 to the tail move. + */ + mlip = xfs_ail_min(ailp); + tail_lsn = mlip ? mlip->li_lsn : 0; + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, tail_lsn); +} /* * The active item list (AIL) is a doubly linked list of log @@ -623,16 +646,13 @@ xfs_trans_ail_destroy( } /* - * Insert the given log item into the AIL. - * We almost always insert at the end of the list, so on inserts - * we search from the end of the list to find where the - * new item belongs. + * splice the log item list into the AIL at the given LSN. */ STATIC void -xfs_ail_insert( +xfs_ail_splice( struct xfs_ail *ailp, - xfs_log_item_t *lip) -/* ARGSUSED */ + struct list_head *list, + xfs_lsn_t lsn) { xfs_log_item_t *next_lip; @@ -640,39 +660,33 @@ xfs_ail_insert( * If the list is empty, just insert the item. */ if (list_empty(&ailp->xa_ail)) { - list_add(&lip->li_ail, &ailp->xa_ail); + list_splice(list, &ailp->xa_ail); return; } list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) + if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) break; } ASSERT((&next_lip->li_ail == &ailp->xa_ail) || - (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); - - list_add(&lip->li_ail, &next_lip->li_ail); + (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)); - xfs_ail_check(ailp, lip); + list_splice_init(list, &next_lip->li_ail); return; } /* * Delete the given item from the AIL. Return a pointer to the item. */ -/*ARGSUSED*/ -STATIC xfs_log_item_t * +STATIC void xfs_ail_delete( struct xfs_ail *ailp, xfs_log_item_t *lip) -/* ARGSUSED */ { xfs_ail_check(ailp, lip); - list_del(&lip->li_ail); - - return lip; + xfs_trans_ail_cursor_clear(ailp, lip); } /* @@ -682,7 +696,6 @@ xfs_ail_delete( STATIC xfs_log_item_t * xfs_ail_min( struct xfs_ail *ailp) -/* ARGSUSED */ { if (list_empty(&ailp->xa_ail)) return NULL; @@ -699,7 +712,6 @@ STATIC xfs_log_item_t * xfs_ail_next( struct xfs_ail *ailp, xfs_log_item_t *lip) -/* ARGSUSED */ { if (lip->li_ail.next == &ailp->xa_ail) return NULL; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index f783d5e9fa7..f7590f5bade 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; - next_extent = efip->efi_next_extent; + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &(efip->efi_format.efi_extents[next_extent]); extp->ext_start = start_block; extp->ext_len = ext_len; - efip->efi_next_extent++; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 62da86c90de..35162c238fa 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -22,15 +22,17 @@ struct xfs_log_item; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, int flags); -void xfs_trans_item_committed(struct xfs_log_item *lip, - xfs_lsn_t commit_lsn, int aborted); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); /* * AIL traversal cursor. * @@ -73,12 +75,29 @@ struct xfs_ail { /* * From xfs_trans_ail.c */ -void xfs_trans_ail_update(struct xfs_ail *ailp, - struct xfs_log_item *lip, xfs_lsn_t lsn) - __releases(ailp->xa_lock); -void xfs_trans_ail_delete(struct xfs_ail *ailp, - struct xfs_log_item *lip) - __releases(ailp->xa_lock); +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); +} + +void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items) + __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) __releases(ailp->xa_lock) +{ + xfs_trans_ail_delete_bulk(ailp, &lip, 1); +} + void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 8e4a63c4151..d8e6f8cd6f0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -964,29 +964,48 @@ xfs_release( xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); } - if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (ip->i_d.di_nlink == 0) + return 0; - /* - * If we can't get the iolock just skip truncating - * the blocks past EOF because we could deadlock - * with the mmap_sem otherwise. We'll get another - * chance to drop them once the last reference to - * the inode is dropped, so we'll never leak blocks - * permanently. - */ - error = xfs_free_eofblocks(mp, ip, - XFS_FREE_EOF_TRYLOCK); - if (error) - return error; - } - } + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * oustanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } return 0; } |