diff options
Diffstat (limited to 'fs/xfs/xfs_buf.c')
-rw-r--r-- | fs/xfs/xfs_buf.c | 593 |
1 files changed, 252 insertions, 341 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6819b5163e3..172d3cc8f8c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -35,14 +35,12 @@ #include <linux/freezer.h> #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; -STATIC int xfsbufd(void *); static struct workqueue_struct *xfslogd_workqueue; @@ -57,11 +55,7 @@ static struct workqueue_struct *xfslogd_workqueue; #endif #define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) - -#define xb_to_km(flags) \ - (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) static inline int @@ -71,11 +65,11 @@ xfs_buf_is_vmapped( /* * Return true if the buffer is vmapped. * - * The XBF_MAPPED flag is set if the buffer should be mapped, but the - * code is clever enough to know it doesn't have to map a single page, - * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. */ - return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; + return bp->b_addr && bp->b_page_count > 1; } static inline int @@ -144,8 +138,17 @@ void xfs_buf_stale( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_STALE; - xfs_buf_delwri_dequeue(bp); + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. + */ + bp->b_flags &= ~_XBF_DELWRI_Q; + atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -164,22 +167,22 @@ xfs_buf_stale( struct xfs_buf * xfs_buf_alloc( struct xfs_buftarg *target, - xfs_off_t range_base, - size_t range_length, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; - bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) return NULL; /* - * We don't want certain flags to appear in b_flags. + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. */ - flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); @@ -189,14 +192,22 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; - bp->b_file_offset = range_base; + /* - * Set buffer_length and count_desired to the same value initially. - * I/O routines should use count_desired, which will be the same in + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_flags = flags; + + /* + * We do not set the block number here in the buffer because we have not + * finished initialising the buffer. We insert the buffer into the cache + * in this state, so this ensures that we are unable to do IO on a + * buffer that hasn't been fully initialised. + */ bp->b_bn = XFS_BUF_DADDR_NULL; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -219,13 +230,12 @@ _xfs_buf_get_pages( { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { - bp->b_offset = xfs_buf_poff(bp->b_file_offset); bp->b_page_count = page_count; if (page_count <= XB_PAGES) { bp->b_pages = bp->b_page_array; } else { bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, xb_to_km(flags)); + page_count, KM_NOFS); if (bp->b_pages == NULL) return -ENOMEM; } @@ -288,11 +298,11 @@ xfs_buf_allocate_memory( xfs_buf_t *bp, uint flags) { - size_t size = bp->b_count_desired; + size_t size; size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; - xfs_off_t end; + xfs_off_t start, end; int error; /* @@ -300,15 +310,15 @@ xfs_buf_allocate_memory( * the memory from the heap - there's no need for the complexity of * page arrays to keep allocation down to order 0. */ - if (bp->b_buffer_length < PAGE_SIZE) { - bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; } - if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & - PAGE_MASK) != + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ kmem_free(bp->b_addr); @@ -319,13 +329,14 @@ xfs_buf_allocate_memory( bp->b_pages = bp->b_page_array; bp->b_pages[0] = virt_to_page(bp->b_addr); bp->b_page_count = 1; - bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + bp->b_flags |= _XBF_KMEM; return 0; } use_alloc_page: - end = bp->b_file_offset + bp->b_buffer_length; - page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); + start = BBTOB(bp->b_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; @@ -388,8 +399,9 @@ _xfs_buf_map_pages( if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; - bp->b_flags |= XBF_MAPPED; - } else if (flags & XBF_MAPPED) { + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { int retried = 0; do { @@ -403,7 +415,6 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; bp->b_addr += bp->b_offset; - bp->b_flags |= XBF_MAPPED; } return 0; @@ -420,29 +431,27 @@ _xfs_buf_map_pages( */ xfs_buf_t * _xfs_buf_find( - xfs_buftarg_t *btp, /* block device target */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + struct xfs_buftarg *btp, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - xfs_off_t range_base; - size_t range_length; + size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; - range_base = (ioff << BBSHIFT); - range_length = (isize << BBSHIFT); + numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->bt_sshift))); - ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < (1 << btp->bt_sshift))); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, ioff)); + xfs_daddr_to_agno(btp->bt_mount, blkno)); /* walk tree */ spin_lock(&pag->pag_buf_lock); @@ -453,20 +462,20 @@ _xfs_buf_find( parent = *rbp; bp = rb_entry(parent, struct xfs_buf, b_rbnode); - if (range_base < bp->b_file_offset) + if (blkno < bp->b_bn) rbp = &(*rbp)->rb_left; - else if (range_base > bp->b_file_offset) + else if (blkno > bp->b_bn) rbp = &(*rbp)->rb_right; else { /* - * found a block offset match. If the range doesn't + * found a block number match. If the range doesn't * match, the only way this is allowed is if the buffer * in the cache is stale and the transaction that made * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching to the right for an exact match. */ - if (bp->b_buffer_length != range_length) { + if (bp->b_length != numblks) { ASSERT(bp->b_flags & XBF_STALE); rbp = &(*rbp)->rb_right; continue; @@ -511,7 +520,7 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -526,63 +535,59 @@ found: */ struct xfs_buf * xfs_buf_get( - xfs_buftarg_t *target,/* target for buffer */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + xfs_buftarg_t *target, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, - flags); + new_bp = xfs_buf_alloc(target, blkno, numblks, flags); if (unlikely(!new_bp)) return NULL; - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); - if (!bp) { + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { kmem_zone_free(xfs_buf_zone, new_bp); return NULL; } - if (bp == new_bp) { - error = xfs_buf_allocate_memory(bp, flags); - if (error) - goto no_buffer; - } else - kmem_zone_free(xfs_buf_zone, new_bp); + bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } + + if (bp != new_bp) + xfs_buf_free(new_bp); /* * Now we have a workable buffer, fill in the block number so * that we can do IO on it. */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; + bp->b_bn = blkno; + bp->b_io_length = bp->b_length; found: - if (!(bp->b_flags & XBF_MAPPED)) { + if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } } XFS_STATS_INC(xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - -no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } STATIC int @@ -590,32 +595,30 @@ _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) { - int status; - - ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - status = xfs_buf_iorequest(bp); - if (status || bp->b_error || (flags & XBF_ASYNC)) - return status; + xfs_buf_iorequest(bp); + if (flags & XBF_ASYNC) + return 0; return xfs_buf_iowait(bp); } xfs_buf_t * xfs_buf_read( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { xfs_buf_t *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, ioff, isize, flags); + bp = xfs_buf_get(target, blkno, numblks, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); @@ -627,7 +630,8 @@ xfs_buf_read( * Read ahead call which is already satisfied, * drop the buffer */ - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } else { /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; @@ -635,12 +639,6 @@ xfs_buf_read( } return bp; - - no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } /* @@ -650,14 +648,14 @@ xfs_buf_read( void xfs_buf_readahead( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize) + xfs_daddr_t blkno, + size_t numblks) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, ioff, isize, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); + xfs_buf_read(target, blkno, numblks, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); } /* @@ -666,16 +664,15 @@ xfs_buf_readahead( */ struct xfs_buf * xfs_buf_read_uncached( - struct xfs_mount *mp, struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t length, + size_t numblks, int flags) { xfs_buf_t *bp; int error; - bp = xfs_buf_get_uncached(target, length, flags); + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) return NULL; @@ -683,9 +680,9 @@ xfs_buf_read_uncached( XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - xfsbdstrat(mp, bp); + xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); - if (error || bp->b_error) { + if (error) { xfs_buf_relse(bp); return NULL; } @@ -699,7 +696,7 @@ xfs_buf_read_uncached( void xfs_buf_set_empty( struct xfs_buf *bp, - size_t len) + size_t numblks) { if (bp->b_pages) _xfs_buf_free_pages(bp); @@ -707,10 +704,9 @@ xfs_buf_set_empty( bp->b_pages = NULL; bp->b_page_count = 0; bp->b_addr = NULL; - bp->b_file_offset = 0; - bp->b_buffer_length = bp->b_count_desired = len; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_bn = XFS_BUF_DADDR_NULL; - bp->b_flags &= ~XBF_MAPPED; } static inline struct page * @@ -749,7 +745,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); + rval = _xfs_buf_get_pages(bp, page_count, 0); if (rval) return rval; @@ -760,9 +756,8 @@ xfs_buf_associate_memory( pageaddr += PAGE_SIZE; } - bp->b_count_desired = len; - bp->b_buffer_length = buflen; - bp->b_flags |= XBF_MAPPED; + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); return 0; } @@ -770,17 +765,18 @@ xfs_buf_associate_memory( xfs_buf_t * xfs_buf_get_uncached( struct xfs_buftarg *target, - size_t len, + size_t numblks, int flags) { - unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + unsigned long page_count; int error, i; xfs_buf_t *bp; - bp = xfs_buf_alloc(target, 0, len, 0); + bp = xfs_buf_alloc(target, 0, numblks, 0); if (unlikely(bp == NULL)) goto fail; + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; error = _xfs_buf_get_pages(bp, page_count, 0); if (error) goto fail_free_buf; @@ -792,7 +788,7 @@ xfs_buf_get_uncached( } bp->b_flags |= _XBF_PAGES; - error = _xfs_buf_map_pages(bp, XBF_MAPPED); + error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); @@ -855,7 +851,7 @@ xfs_buf_rele( spin_unlock(&pag->pag_buf_lock); } else { xfs_buf_lru_del(bp); - ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); @@ -915,13 +911,6 @@ xfs_buf_lock( trace_xfs_buf_lock_done(bp, _RET_IP_); } -/* - * Releases the lock on the buffer object. - * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to - * take a reference for the delwri queue because the unlocker is going to - * drop their's and they don't know we just queued it. - */ void xfs_buf_unlock( struct xfs_buf *bp) @@ -1008,9 +997,8 @@ xfs_buf_ioerror_alert( const char *func) { xfs_alert(bp->b_target->bt_mount, -"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", - (__uint64_t)XFS_BUF_ADDR(bp), func, - bp->b_error, XFS_BUF_COUNT(bp)); +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); } int @@ -1019,10 +1007,11 @@ xfs_bwrite( { int error; + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); - xfs_buf_delwri_dequeue(bp); xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); @@ -1181,7 +1170,7 @@ _xfs_buf_ioapply( int rw, map_i, total_nr_pages, nr_pages; struct bio *bio; int offset = bp->b_offset; - int size = bp->b_count_desired; + int size = BBTOB(bp->b_io_length); sector_t sector = bp->b_bn; total_nr_pages = bp->b_page_count; @@ -1229,7 +1218,7 @@ next_chunk: break; offset = 0; - sector += nbytes >> BBSHIFT; + sector += BTOBB(nbytes); size -= nbytes; total_nr_pages--; } @@ -1248,13 +1237,13 @@ next_chunk: } } -int +void xfs_buf_iorequest( xfs_buf_t *bp) { trace_xfs_buf_iorequest(bp, _RET_IP_); - ASSERT(!(bp->b_flags & XBF_DELWRI)); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1269,13 +1258,12 @@ xfs_buf_iorequest( _xfs_buf_ioend(bp, 0); xfs_buf_rele(bp); - return 0; } /* - * Waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. - * It returns the I/O error code, if any, or 0 if there was no error. + * Waits for I/O to complete on the buffer supplied. It returns immediately if + * no I/O is pending or there is already a pending error on the buffer. It + * returns the I/O error code, if any, or 0 if there was no error. */ int xfs_buf_iowait( @@ -1283,7 +1271,8 @@ xfs_buf_iowait( { trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); + if (!bp->b_error) + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); return bp->b_error; @@ -1296,7 +1285,7 @@ xfs_buf_offset( { struct page *page; - if (bp->b_flags & XBF_MAPPED) + if (bp->b_addr) return bp->b_addr + offset; offset += bp->b_offset; @@ -1315,27 +1304,30 @@ xfs_buf_iomove( void *data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { - size_t bend, cpoff, csize; - struct page *page; + size_t bend; bend = boff + bsize; while (boff < bend) { - page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; - cpoff = xfs_buf_poff(boff + bp->b_offset); - csize = min_t(size_t, - PAGE_SIZE-cpoff, bp->b_count_desired-boff); + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); - ASSERT(((csize + cpoff) <= PAGE_SIZE)); + ASSERT((csize + page_offset) <= PAGE_SIZE); switch (mode) { case XBRW_ZERO: - memset(page_address(page) + cpoff, 0, csize); + memset(page_address(page) + page_offset, 0, csize); break; case XBRW_READ: - memcpy(data, page_address(page) + cpoff, csize); + memcpy(data, page_address(page) + page_offset, csize); break; case XBRW_WRITE: - memcpy(page_address(page) + cpoff, data, csize); + memcpy(page_address(page) + page_offset, data, csize); } boff += csize; @@ -1435,11 +1427,9 @@ xfs_free_buftarg( { unregister_shrinker(&btp->bt_shrinker); - xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - kthread_stop(btp->bt_task); kmem_free(btp); } @@ -1491,20 +1481,6 @@ xfs_setsize_buftarg( return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); } -STATIC int -xfs_alloc_delwri_queue( - xfs_buftarg_t *btp, - const char *fsname) -{ - INIT_LIST_HEAD(&btp->bt_delwri_queue); - spin_lock_init(&btp->bt_delwri_lock); - btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) - return PTR_ERR(btp->bt_task); - return 0; -} - xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -1527,8 +1503,6 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwri_queue(btp, fsname)) - goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&btp->bt_shrinker); @@ -1539,125 +1513,52 @@ error: return NULL; } - /* - * Delayed write buffer handling + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. */ -void +bool xfs_buf_delwri_queue( - xfs_buf_t *bp) + struct xfs_buf *bp, + struct list_head *list) { - struct xfs_buftarg *btp = bp->b_target; - - trace_xfs_buf_delwri_queue(bp, _RET_IP_); - + ASSERT(xfs_buf_islocked(bp)); ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(&btp->bt_delwri_lock); - if (!list_empty(&bp->b_list)) { - /* if already in the queue, move it to the tail */ - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_move_tail(&bp->b_list, &btp->bt_delwri_queue); - } else { - /* start xfsbufd as it is about to have something to do */ - if (list_empty(&btp->bt_delwri_queue)) - wake_up_process(bp->b_target->bt_task); - - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; - list_add_tail(&bp->b_list, &btp->bt_delwri_queue); - } - bp->b_queuetime = jiffies; - spin_unlock(&btp->bt_delwri_lock); -} - -void -xfs_buf_delwri_dequeue( - xfs_buf_t *bp) -{ - int dequeued = 0; - - spin_lock(&bp->b_target->bt_delwri_lock); - if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_del_init(&bp->b_list); - dequeued = 1; + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. + */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(&bp->b_target->bt_delwri_lock); - - if (dequeued) - xfs_buf_rele(bp); - - trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); -} - -/* - * If a delwri buffer needs to be pushed before it has aged out, then promote - * it to the head of the delwri queue so that it will be flushed on the next - * xfsbufd run. We do this by resetting the queuetime of the buffer to be older - * than the age currently needed to flush the buffer. Hence the next time the - * xfsbufd sees it is guaranteed to be considered old enough to flush. - */ -void -xfs_buf_delwri_promote( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; - ASSERT(bp->b_flags & XBF_DELWRI); - ASSERT(bp->b_flags & _XBF_DELWRI_Q); + trace_xfs_buf_delwri_queue(bp, _RET_IP_); /* - * Check the buffer age before locking the delayed write queue as we - * don't need to promote buffers that are already past the flush age. + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. */ - if (bp->b_queuetime < jiffies - age) - return; - bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwri_lock); - list_move(&bp->b_list, &btp->bt_delwri_queue); - spin_unlock(&btp->bt_delwri_lock); -} - -/* - * Move as many buffers as specified to the supplied list - * idicating if we skipped any buffers to prevent deadlocks. - */ -STATIC int -xfs_buf_delwri_split( - xfs_buftarg_t *target, - struct list_head *list, - unsigned long age) -{ - xfs_buf_t *bp, *n; - int skipped = 0; - int force; - - force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); - INIT_LIST_HEAD(list); - spin_lock(&target->bt_delwri_lock); - list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { - if (!force && - time_before(jiffies, bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - list_move_tail(&bp->b_list, list); - trace_xfs_buf_delwri_split(bp, _RET_IP_); - } else - skipped++; + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); } - spin_unlock(&target->bt_delwri_lock); - return skipped; + return true; } /* @@ -1683,99 +1584,109 @@ xfs_buf_cmp( return 0; } -STATIC int -xfsbufd( - void *data) +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) { - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - - current->flags |= PF_MEMALLOC; - - set_freezable(); + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; + } else { + xfs_buf_lock(bp); + } - do { - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); - struct list_head tmp; - struct blk_plug plug; + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } - if (unlikely(freezing(current))) - try_to_freeze(); + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } - /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwri_queue)) - tout = MAX_SCHEDULE_TIMEOUT; - schedule_timeout_interruptible(tout); + list_sort(NULL, io_list, xfs_buf_cmp); - xfs_buf_delwri_split(target, &tmp, age); - list_sort(NULL, &tmp, xfs_buf_cmp); + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags |= XBF_WRITE; - blk_start_plug(&plug); - while (!list_empty(&tmp)) { - struct xfs_buf *bp; - bp = list_first_entry(&tmp, struct xfs_buf, b_list); + if (!wait) { + bp->b_flags |= XBF_ASYNC; list_del_init(&bp->b_list); - xfs_bdstrat_cb(bp); } - blk_finish_plug(&plug); - } while (!kthread_should_stop()); + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); - return 0; + return pinned; } /* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. AIL pushing as the @buffer_list is consumed in this + * function. */ int -xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) { - xfs_buf_t *bp; - int pincount = 0; - LIST_HEAD(tmp_list); - LIST_HEAD(wait_list); - struct blk_plug plug; + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} - flush_workqueue(xfslogd_workqueue); +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; - set_bit(XBT_FORCE_FLUSH, &target->bt_flags); - pincount = xfs_buf_delwri_split(target, &tmp_list, 0); + __xfs_buf_delwri_submit(buffer_list, &io_list, true); - /* - * Dropped the delayed write list lock, now walk the temporary list. - * All I/O is issued async and then if we need to wait for completion - * we do that after issuing all the IO. - */ - list_sort(NULL, &tmp_list, xfs_buf_cmp); + /* Wait for IO to complete. */ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); - blk_start_plug(&plug); - while (!list_empty(&tmp_list)) { - bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); - ASSERT(target == bp->b_target); list_del_init(&bp->b_list); - if (wait) { - bp->b_flags &= ~XBF_ASYNC; - list_add(&bp->b_list, &wait_list); - } - xfs_bdstrat_cb(bp); - } - blk_finish_plug(&plug); - - if (wait) { - /* Wait for IO to complete. */ - while (!list_empty(&wait_list)) { - bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - - list_del_init(&bp->b_list); - xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } + error2 = xfs_buf_iowait(bp); + xfs_buf_relse(bp); + if (!error) + error = error2; } - return pincount; + return error; } int __init |