diff options
Diffstat (limited to 'fs/xfs')
81 files changed, 2493 insertions, 2887 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0a9977983f9..d2bf974b1a2 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-y += xfs_aops.o \ xfs_discard.o \ xfs_error.o \ xfs_export.o \ + xfs_extent_busy.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ @@ -49,7 +50,6 @@ xfs-y += xfs_aops.o \ xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ - xfs_rw.o \ xfs_utils.o \ xfs_vnodeops.o \ kmem.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index a907de565db..4a7286c1dc8 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) } void * -kmem_alloc(size_t size, unsigned int __nocast flags) +kmem_alloc(size_t size, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); @@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) } void * -kmem_zalloc(size_t size, unsigned int __nocast flags) +kmem_zalloc(size_t size, xfs_km_flags_t flags) { void *ptr; @@ -87,7 +87,7 @@ kmem_free(const void *ptr) void * kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, - unsigned int __nocast flags) + xfs_km_flags_t flags) { void *new; @@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, } void * -kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) +kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); @@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) } void * -kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) { void *ptr; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index ab7c53fe346..b2f2620f9a8 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -27,10 +27,11 @@ * General memory allocation interfaces */ -#define KM_SLEEP 0x0001u -#define KM_NOSLEEP 0x0002u -#define KM_NOFS 0x0004u -#define KM_MAYFAIL 0x0008u +typedef unsigned __bitwise xfs_km_flags_t; +#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) +#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) +#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) +#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) /* * We use a special process flag to avoid recursive callbacks into @@ -38,7 +39,7 @@ * warnings, so we explicitly skip any generic ones (silly of us). */ static inline gfp_t -kmem_flags_convert(unsigned int __nocast flags) +kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; @@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags) return lflags; } -extern void *kmem_alloc(size_t, unsigned int __nocast); -extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); +extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc(size_t, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); extern void kmem_free(const void *); static inline void *kmem_zalloc_large(size_t size) @@ -107,7 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone) kmem_cache_destroy(zone); } -extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); -extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); +extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); +extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t); #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 4805f009f92..44d65c1533c 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,24 +175,6 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that - * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_alloc_busy_insert() for details. - */ -struct xfs_busy_extent { - struct rb_node rb_node; /* ag by-bno indexed search tree */ - struct list_head list; /* transaction busy extent list */ - xfs_agnumber_t agno; - xfs_agblock_t bno; - xfs_extlen_t length; - unsigned int flags; -#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ -#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ -}; - -/* * Per-ag incore structure, copies of information in agf and agi, * to improve the performance of allocation group selection. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0f0df2759b0..229641fb8e6 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,6 +31,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -47,8 +47,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); -STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, - xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); /* * Lookup the record equal to [bno, len] in the btree given by cur. @@ -152,7 +150,7 @@ xfs_alloc_compute_aligned( xfs_extlen_t len; /* Trim busy sections out of found extent */ - xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); @@ -536,7 +534,7 @@ xfs_alloc_ag_vextent( if (error) return error; - ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + ASSERT(!xfs_extent_busy_search(args->mp, args->agno, args->agbno, args->len)); } @@ -603,7 +601,7 @@ xfs_alloc_ag_vextent_exact( /* * Check for overlapping busy extents. */ - xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1391,7 +1389,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error0; if (fbno != NULLAGBLOCK) { - xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, args->userdata); if (args->userdata) { @@ -2496,579 +2494,8 @@ xfs_free_extent( error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); if (!error) - xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); + xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); error0: xfs_perag_put(args.pag); return error; } - -void -xfs_alloc_busy_insert( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - unsigned int flags) -{ - struct xfs_busy_extent *new; - struct xfs_busy_extent *busyp; - struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent = NULL; - - new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. - */ - trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - - new->agno = agno; - new->bno = bno; - new->length = len; - INIT_LIST_HEAD(&new->list); - new->flags = flags; - - /* trace before insert to be able to see failed inserts */ - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); - - pag = xfs_perag_get(tp->t_mountp, new->agno); - spin_lock(&pag->pagb_lock); - rbp = &pag->pagb_tree.rb_node; - while (*rbp) { - parent = *rbp; - busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); - - if (new->bno < busyp->bno) { - rbp = &(*rbp)->rb_left; - ASSERT(new->bno + new->length <= busyp->bno); - } else if (new->bno > busyp->bno) { - rbp = &(*rbp)->rb_right; - ASSERT(bno >= busyp->bno + busyp->length); - } else { - ASSERT(0); - } - } - - rb_link_node(&new->rb_node, parent, rbp); - rb_insert_color(&new->rb_node, &pag->pagb_tree); - - list_add(&new->list, &tp->t_busy); - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * Search for a busy extent within the range of the extent we are about to - * allocate. You need to be holding the busy extent tree lock when calling - * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy - * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact - * match. This is done so that a non-zero return indicates an overlap that - * will require a synchronous transaction, but it can still be - * used to distinguish between a partial or exact match. - */ -int -xfs_alloc_busy_search( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - struct xfs_busy_extent *busyp; - int match = 0; - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); - - rbp = pag->pagb_tree.rb_node; - - /* find closest start bno overlap */ - while (rbp) { - busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); - if (bno < busyp->bno) { - /* may overlap, but exact start block is lower */ - if (bno + len > busyp->bno) - match = -1; - rbp = rbp->rb_left; - } else if (bno > busyp->bno) { - /* may overlap, but exact start block is higher */ - if (bno < busyp->bno + busyp->length) - match = -1; - rbp = rbp->rb_right; - } else { - /* bno matches busyp, length determines exact match */ - match = (busyp->length == len) ? 1 : -1; - break; - } - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - return match; -} - -/* - * The found free extent [fbno, fend] overlaps part or all of the given busy - * extent. If the overlap covers the beginning, the end, or all of the busy - * extent, the overlapping portion can be made unbusy and used for the - * allocation. We can't split a busy extent because we can't modify a - * transaction/CIL context busy list, but we can update an entries block - * number or length. - * - * Returns true if the extent can safely be reused, or false if the search - * needs to be restarted. - */ -STATIC bool -xfs_alloc_busy_update_extent( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - /* - * This extent is currently being discarded. Give the thread - * performing the discard a chance to mark the extent unbusy - * and retry. - */ - if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); - delay(1); - spin_lock(&pag->pagb_lock); - return false; - } - - /* - * If there is a busy extent overlapping a user allocation, we have - * no choice but to force the log and retry the search. - * - * Fortunately this does not happen during normal operation, but - * only if the filesystem is very low on space and has to dip into - * the AGFL for normal allocations. - */ - if (userdata) - goto out_force_log; - - if (bbno < fbno && bend > fend) { - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - */ - - /* - * We would have to split the busy extent to be able to track - * it correct, which we cannot do because we would have to - * modify the list of busy extents attached to the transaction - * or CIL context, which is immutable. - * - * Force out the log to clear the busy extent and retry the - * search. - */ - goto out_force_log; - } else if (bbno >= fbno && bend <= fend) { - /* - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - */ - - /* - * The busy extent is fully covered by the extent we are - * allocating, and can simply be removed from the rbtree. - * However we cannot remove it from the immutable list - * tracking busy extents in the transaction or CIL context, - * so set the length to zero to mark it invalid. - * - * We also need to restart the busy extent search from the - * tree root, because erasing the node can rearrange the - * tree topology. - */ - rb_erase(&busyp->rb_node, &pag->pagb_tree); - busyp->length = 0; - return false; - } else if (fend < bend) { - /* - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - */ - busyp->bno = fend; - } else if (bbno < fbno) { - /* - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - */ - busyp->length = fbno - busyp->bno; - } else { - ASSERT(0); - } - - trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); - return true; - -out_force_log: - spin_unlock(&pag->pagb_lock); - xfs_log_force(mp, XFS_LOG_SYNC); - trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); - spin_lock(&pag->pagb_lock); - return false; -} - - -/* - * For a given extent [fbno, flen], make sure we can reuse it safely. - */ -void -xfs_alloc_busy_reuse( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - - ASSERT(flen > 0); - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); -restart: - rbp = pag->pagb_tree.rb_node; - while (rbp) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fbno + flen <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, - userdata)) - goto restart; - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * For a given extent [fbno, flen], search the busy extent list to find a - * subset of the extent that is not busy. If *rlen is smaller than - * args->minlen no suitable extent could be found, and the higher level - * code needs to force out the log and retry the allocation. - */ -STATIC void -xfs_alloc_busy_trim( - struct xfs_alloc_arg *args, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_agblock_t *rbno, - xfs_extlen_t *rlen) -{ - xfs_agblock_t fbno; - xfs_extlen_t flen; - struct rb_node *rbp; - - ASSERT(len > 0); - - spin_lock(&args->pag->pagb_lock); -restart: - fbno = bno; - flen = len; - rbp = args->pag->pagb_tree.rb_node; - while (rbp && flen >= args->minlen) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fend <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!args->userdata && - !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { - if (!xfs_alloc_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - - if (bbno <= fbno) { - /* start overlap */ - - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * No unbusy region in extent, return failure. - */ - if (fend <= bend) - goto fail; - - /* - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - * - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fbno = bend; - } else if (bend >= fend) { - /* end overlap */ - - /* - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fend = bbno; - } else { - /* middle overlap */ - - /* - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - * Can be trimmed to: - * +-------+ OR +-------+ - * fbno fend fbno fend - * - * Backward allocation leads to significant - * fragmentation of directories, which degrades - * directory performance, therefore we always want to - * choose the option that produces forward allocation - * patterns. - * Preferring the lower bno extent will make the next - * request use "fend" as the start of the next - * allocation; if the segment is no longer busy at - * that point, we'll get a contiguous allocation, but - * even if it is still busy, we will get a forward - * allocation. - * We try to avoid choosing the segment at "bend", - * because that can lead to the next allocation - * taking the segment at "fbno", which would be a - * backward allocation. We only use the segment at - * "fbno" if it is much larger than the current - * requested size, because in that case there's a - * good chance subsequent allocations will be - * contiguous. - */ - if (bbno - fbno >= args->maxlen) { - /* left candidate fits perfect */ - fend = bbno; - } else if (fend - bend >= args->maxlen * 4) { - /* right candidate has enough free space */ - fbno = bend; - } else if (bbno - fbno >= args->minlen) { - /* left candidate fits minimum requirement */ - fend = bbno; - } else { - goto fail; - } - } - - flen = fend - fbno; - } - spin_unlock(&args->pag->pagb_lock); - - if (fbno != bno || flen != len) { - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, - fbno, flen); - } - *rbno = fbno; - *rlen = flen; - return; -fail: - /* - * Return a zero extent length as failure indications. All callers - * re-check if the trimmed extent satisfies the minlen requirement. - */ - spin_unlock(&args->pag->pagb_lock); - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); - *rbno = fbno; - *rlen = 0; -} - -static void -xfs_alloc_busy_clear_one( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp) -{ - if (busyp->length) { - trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, - busyp->length); - rb_erase(&busyp->rb_node, &pag->pagb_tree); - } - - list_del_init(&busyp->list); - kmem_free(busyp); -} - -/* - * Remove all extents on the passed in list from the busy extents tree. - * If do_discard is set skip extents that need to be discarded, and mark - * these as undergoing a discard operation instead. - */ -void -xfs_alloc_busy_clear( - struct xfs_mount *mp, - struct list_head *list, - bool do_discard) -{ - struct xfs_busy_extent *busyp, *n; - struct xfs_perag *pag = NULL; - xfs_agnumber_t agno = NULLAGNUMBER; - - list_for_each_entry_safe(busyp, n, list, list) { - if (busyp->agno != agno) { - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); - agno = busyp->agno; - } - - if (do_discard && busyp->length && - !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) - busyp->flags = XFS_ALLOC_BUSY_DISCARDED; - else - xfs_alloc_busy_clear_one(mp, pag, busyp); - } - - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } -} - -/* - * Callback for list_sort to sort busy extents by the AG they reside in. - */ -int -xfs_busy_extent_ag_cmp( - void *priv, - struct list_head *a, - struct list_head *b) -{ - return container_of(a, struct xfs_busy_extent, list)->agno - - container_of(b, struct xfs_busy_extent, list)->agno; -} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 3a7e7d8f8de..93be4a667ca 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -23,7 +23,6 @@ struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; -struct xfs_busy_extent; extern struct workqueue_struct *xfs_alloc_wq; @@ -139,33 +138,6 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); -#ifdef __KERNEL__ -void -xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); - -void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); - -int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - -void -xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); - -int -xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); - -static inline void xfs_alloc_busy_sort(struct list_head *list) -{ - list_sort(NULL, list, xfs_busy_extent_ag_cmp); -} - -#endif /* __KERNEL__ */ - /* * Compute and fill in value of m_ag_maxlevels. */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index ffb3386e45c..f1647caace8 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,6 +30,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -94,7 +93,7 @@ xfs_allocbt_alloc_block( return 0; } - xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -119,8 +118,8 @@ xfs_allocbt_free_block( if (error) return error; - xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, - XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0dbb9e70fe2..ae31c313a79 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -16,9 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -29,7 +27,6 @@ #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -623,7 +620,7 @@ xfs_map_at_offset( * or delayed allocate extent. */ STATIC int -xfs_is_delayed_page( +xfs_check_page_type( struct page *page, unsigned int type) { @@ -637,11 +634,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IO_UNWRITTEN); + acceptable += (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELALLOC); + acceptable += (type == IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_OVERWRITE); + acceptable += (type == IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -684,7 +681,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type)) goto fail_unlock_page; /* @@ -834,7 +831,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELALLOC)) + if (!xfs_check_page_type(page, IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1146,7 +1143,14 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (create) { + /* + * Direct I/O is usually done on preallocated files, so try getting + * a block mapping without an exclusive lock first. For buffered + * writes we already have the exclusive iolock anyway, so avoiding + * a lock roundtrip here by taking the ilock exclusive from the + * beginning is a useful micro optimization. + */ + if (create && !direct) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { @@ -1168,23 +1172,45 @@ __xfs_get_blocks( (!nimaps || (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK))) { - if (direct) { + if (direct || xfs_get_extsz_hint(ip)) { + /* + * Drop the ilock in preparation for starting the block + * allocation transaction. It will be retaken + * exclusively inside xfs_iomap_write_direct for the + * actual allocation. + */ + xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps); + if (error) + return -error; + new = 1; } else { + /* + * Delalloc reservations do not require a transaction, + * we can go on without dropping the lock here. If we + * are allocating a new delalloc block, make sure that + * we set the new flag so that we mark the buffer new so + * that we know that it is newly allocated if the write + * fails. + */ + if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + new = 1; error = xfs_iomap_write_delay(ip, offset, size, &imap); + if (error) + goto out_unlock; + + xfs_iunlock(ip, lockmode); } - if (error) - goto out_unlock; trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); } else if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } - xfs_iunlock(ip, lockmode); if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1386,52 +1412,91 @@ out_destroy_ioend: return ret; } +/* + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have made it to disk yet + * as the page is still locked at this point. + */ +STATIC void +xfs_vm_kill_delalloc_range( + struct inode *inode, + loff_t start, + loff_t end) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + start_fsb = XFS_B_TO_FSB(ip->i_mount, start); + end_fsb = XFS_B_TO_FSB(ip->i_mount, end); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + STATIC void xfs_vm_write_failed( - struct address_space *mapping, - loff_t to) + struct inode *inode, + struct page *page, + loff_t pos, + unsigned len) { - struct inode *inode = mapping->host; + loff_t block_offset = pos & PAGE_MASK; + loff_t block_start; + loff_t block_end; + loff_t from = pos & (PAGE_CACHE_SIZE - 1); + loff_t to = from + len; + struct buffer_head *bh, *head; - if (to > inode->i_size) { - /* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have - * made it to disk yet as the page is still locked at this - * point. - */ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; + ASSERT(block_offset + from == pos); - truncate_pagecache(inode, to, inode->i_size); + head = page_buffers(page); + block_start = 0; + for (bh = head; bh != head || !block_start; + bh = bh->b_this_page, block_start = block_end, + block_offset += bh->b_size) { + block_end = block_start + bh->b_size; - /* - * Check if there are any blocks that are outside of i_size - * that need to be trimmed back. - */ - start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; - end_fsb = XFS_B_TO_FSB(ip->i_mount, to); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* skip buffers before the write */ + if (block_end <= from) + continue; + + /* if the buffer is after the write, we're done */ + if (block_start >= to) + break; + + if (!buffer_delay(bh)) + continue; + + if (!buffer_new(bh) && block_offset < i_size_read(inode)) + continue; + + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); } + } +/* + * This used to call block_write_begin(), but it unlocks and releases the page + * on error, and we need that page to be able to punch stale delalloc blocks out + * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at + * the appropriate point. + */ STATIC int xfs_vm_write_begin( struct file *file, @@ -1442,15 +1507,40 @@ xfs_vm_write_begin( struct page **pagep, void **fsdata) { - int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; - ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, - pagep, xfs_get_blocks); - if (unlikely(ret)) - xfs_vm_write_failed(mapping, pos + len); - return ret; + ASSERT(len <= PAGE_CACHE_SIZE); + + page = grab_cache_page_write_begin(mapping, index, + flags | AOP_FLAG_NOFS); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (unlikely(status)) { + struct inode *inode = mapping->host; + + xfs_vm_write_failed(inode, page, pos, len); + unlock_page(page); + + if (pos + len > i_size_read(inode)) + truncate_pagecache(inode, pos + len, i_size_read(inode)); + + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; } +/* + * On failure, we only need to kill delalloc blocks beyond EOF because they + * will never be written. For blocks within EOF, generic_write_end() zeros them + * so they are safe to leave alone and be written with all the other valid data. + */ STATIC int xfs_vm_write_end( struct file *file, @@ -1463,9 +1553,19 @@ xfs_vm_write_end( { int ret; + ASSERT(len <= PAGE_CACHE_SIZE); + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) - xfs_vm_write_failed(mapping, pos + len); + if (unlikely(ret < len)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + loff_t to = pos + len; + + if (to > isize) { + truncate_pagecache(inode, to, isize); + xfs_vm_kill_delalloc_range(inode, isize, to); + } + } return ret; } diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 65d61b948ea..a17ff01b5ad 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -21,7 +21,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,7 +38,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -1987,14 +1985,12 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XBF_LOCK | XBF_DONT_BLOCK, - &bp); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, blkcnt, 0, &bp); if (error) return(error); - tmp = (valuelen < XFS_BUF_SIZE(bp)) - ? valuelen : XFS_BUF_SIZE(bp); + tmp = min_t(int, valuelen, BBTOB(bp->b_length)); xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); xfs_buf_relse(bp); dst += tmp; @@ -2097,6 +2093,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) lblkno = args->rmtblkno; valuelen = args->valuelen; while (valuelen > 0) { + int buflen; + /* * Try to remember where we decided to put the value. */ @@ -2114,15 +2112,16 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, - XBF_LOCK | XBF_DONT_BLOCK); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); if (!bp) return ENOMEM; - tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : - XFS_BUF_SIZE(bp); + + buflen = BBTOB(bp->b_length); + tmp = min_t(int, valuelen, buflen); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < XFS_BUF_SIZE(bp)) - xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if (tmp < buflen) + xfs_buf_zero(bp, tmp, buflen - tmp); + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 76d93dc953e..7d89d800f51 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -2983,7 +2982,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, map.br_blockcount); bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, XBF_LOCK); + dblkno, dblkcnt, 0); if (!bp) return ENOMEM; xfs_trans_binval(*trans, bp); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 85e7e327bcd..58b815ec8c9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -41,7 +41,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_attr_leaf.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" @@ -4527,7 +4526,7 @@ out_unreserve_blocks: xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); return error; } @@ -5621,8 +5620,20 @@ xfs_getbmap( XFS_FSB_TO_BB(mp, map[i].br_blockcount); out[cur_ext].bmv_unused1 = 0; out[cur_ext].bmv_unused2 = 0; - ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || - (map[i].br_startblock != DELAYSTARTBLOCK)); + + /* + * delayed allocation extents that start beyond EOF can + * occur due to speculative EOF allocation when the + * delalloc extent is larger than the largest freespace + * extent at conversion time. These extents cannot be + * converted by data writeback, so can exist here even + * if we are not supposed to be finding delalloc + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && + map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ @@ -6157,3 +6168,16 @@ next_block: return error; } + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 89ee672d378..803b56d7ce1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -211,6 +211,9 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index e2f5d59cbea..862084a47a7 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 1f19f03af9d..e53e317b158 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6819b5163e3..172d3cc8f8c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -35,14 +35,12 @@ #include <linux/freezer.h> #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; -STATIC int xfsbufd(void *); static struct workqueue_struct *xfslogd_workqueue; @@ -57,11 +55,7 @@ static struct workqueue_struct *xfslogd_workqueue; #endif #define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) - -#define xb_to_km(flags) \ - (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) static inline int @@ -71,11 +65,11 @@ xfs_buf_is_vmapped( /* * Return true if the buffer is vmapped. * - * The XBF_MAPPED flag is set if the buffer should be mapped, but the - * code is clever enough to know it doesn't have to map a single page, - * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. */ - return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; + return bp->b_addr && bp->b_page_count > 1; } static inline int @@ -144,8 +138,17 @@ void xfs_buf_stale( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_STALE; - xfs_buf_delwri_dequeue(bp); + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. + */ + bp->b_flags &= ~_XBF_DELWRI_Q; + atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -164,22 +167,22 @@ xfs_buf_stale( struct xfs_buf * xfs_buf_alloc( struct xfs_buftarg *target, - xfs_off_t range_base, - size_t range_length, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; - bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) return NULL; /* - * We don't want certain flags to appear in b_flags. + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. */ - flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); @@ -189,14 +192,22 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; - bp->b_file_offset = range_base; + /* - * Set buffer_length and count_desired to the same value initially. - * I/O routines should use count_desired, which will be the same in + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_flags = flags; + + /* + * We do not set the block number here in the buffer because we have not + * finished initialising the buffer. We insert the buffer into the cache + * in this state, so this ensures that we are unable to do IO on a + * buffer that hasn't been fully initialised. + */ bp->b_bn = XFS_BUF_DADDR_NULL; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -219,13 +230,12 @@ _xfs_buf_get_pages( { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { - bp->b_offset = xfs_buf_poff(bp->b_file_offset); bp->b_page_count = page_count; if (page_count <= XB_PAGES) { bp->b_pages = bp->b_page_array; } else { bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, xb_to_km(flags)); + page_count, KM_NOFS); if (bp->b_pages == NULL) return -ENOMEM; } @@ -288,11 +298,11 @@ xfs_buf_allocate_memory( xfs_buf_t *bp, uint flags) { - size_t size = bp->b_count_desired; + size_t size; size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; - xfs_off_t end; + xfs_off_t start, end; int error; /* @@ -300,15 +310,15 @@ xfs_buf_allocate_memory( * the memory from the heap - there's no need for the complexity of * page arrays to keep allocation down to order 0. */ - if (bp->b_buffer_length < PAGE_SIZE) { - bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; } - if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & - PAGE_MASK) != + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ kmem_free(bp->b_addr); @@ -319,13 +329,14 @@ xfs_buf_allocate_memory( bp->b_pages = bp->b_page_array; bp->b_pages[0] = virt_to_page(bp->b_addr); bp->b_page_count = 1; - bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + bp->b_flags |= _XBF_KMEM; return 0; } use_alloc_page: - end = bp->b_file_offset + bp->b_buffer_length; - page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); + start = BBTOB(bp->b_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; @@ -388,8 +399,9 @@ _xfs_buf_map_pages( if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; - bp->b_flags |= XBF_MAPPED; - } else if (flags & XBF_MAPPED) { + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { int retried = 0; do { @@ -403,7 +415,6 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; bp->b_addr += bp->b_offset; - bp->b_flags |= XBF_MAPPED; } return 0; @@ -420,29 +431,27 @@ _xfs_buf_map_pages( */ xfs_buf_t * _xfs_buf_find( - xfs_buftarg_t *btp, /* block device target */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + struct xfs_buftarg *btp, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - xfs_off_t range_base; - size_t range_length; + size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; - range_base = (ioff << BBSHIFT); - range_length = (isize << BBSHIFT); + numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->bt_sshift))); - ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < (1 << btp->bt_sshift))); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, ioff)); + xfs_daddr_to_agno(btp->bt_mount, blkno)); /* walk tree */ spin_lock(&pag->pag_buf_lock); @@ -453,20 +462,20 @@ _xfs_buf_find( parent = *rbp; bp = rb_entry(parent, struct xfs_buf, b_rbnode); - if (range_base < bp->b_file_offset) + if (blkno < bp->b_bn) rbp = &(*rbp)->rb_left; - else if (range_base > bp->b_file_offset) + else if (blkno > bp->b_bn) rbp = &(*rbp)->rb_right; else { /* - * found a block offset match. If the range doesn't + * found a block number match. If the range doesn't * match, the only way this is allowed is if the buffer * in the cache is stale and the transaction that made * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching to the right for an exact match. */ - if (bp->b_buffer_length != range_length) { + if (bp->b_length != numblks) { ASSERT(bp->b_flags & XBF_STALE); rbp = &(*rbp)->rb_right; continue; @@ -511,7 +520,7 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -526,63 +535,59 @@ found: */ struct xfs_buf * xfs_buf_get( - xfs_buftarg_t *target,/* target for buffer */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + xfs_buftarg_t *target, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, - flags); + new_bp = xfs_buf_alloc(target, blkno, numblks, flags); if (unlikely(!new_bp)) return NULL; - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); - if (!bp) { + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { kmem_zone_free(xfs_buf_zone, new_bp); return NULL; } - if (bp == new_bp) { - error = xfs_buf_allocate_memory(bp, flags); - if (error) - goto no_buffer; - } else - kmem_zone_free(xfs_buf_zone, new_bp); + bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } + + if (bp != new_bp) + xfs_buf_free(new_bp); /* * Now we have a workable buffer, fill in the block number so * that we can do IO on it. */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; + bp->b_bn = blkno; + bp->b_io_length = bp->b_length; found: - if (!(bp->b_flags & XBF_MAPPED)) { + if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } } XFS_STATS_INC(xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - -no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } STATIC int @@ -590,32 +595,30 @@ _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) { - int status; - - ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - status = xfs_buf_iorequest(bp); - if (status || bp->b_error || (flags & XBF_ASYNC)) - return status; + xfs_buf_iorequest(bp); + if (flags & XBF_ASYNC) + return 0; return xfs_buf_iowait(bp); } xfs_buf_t * xfs_buf_read( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { xfs_buf_t *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, ioff, isize, flags); + bp = xfs_buf_get(target, blkno, numblks, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); @@ -627,7 +630,8 @@ xfs_buf_read( * Read ahead call which is already satisfied, * drop the buffer */ - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } else { /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; @@ -635,12 +639,6 @@ xfs_buf_read( } return bp; - - no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } /* @@ -650,14 +648,14 @@ xfs_buf_read( void xfs_buf_readahead( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize) + xfs_daddr_t blkno, + size_t numblks) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, ioff, isize, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); + xfs_buf_read(target, blkno, numblks, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); } /* @@ -666,16 +664,15 @@ xfs_buf_readahead( */ struct xfs_buf * xfs_buf_read_uncached( - struct xfs_mount *mp, struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t length, + size_t numblks, int flags) { xfs_buf_t *bp; int error; - bp = xfs_buf_get_uncached(target, length, flags); + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) return NULL; @@ -683,9 +680,9 @@ xfs_buf_read_uncached( XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - xfsbdstrat(mp, bp); + xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); - if (error || bp->b_error) { + if (error) { xfs_buf_relse(bp); return NULL; } @@ -699,7 +696,7 @@ xfs_buf_read_uncached( void xfs_buf_set_empty( struct xfs_buf *bp, - size_t len) + size_t numblks) { if (bp->b_pages) _xfs_buf_free_pages(bp); @@ -707,10 +704,9 @@ xfs_buf_set_empty( bp->b_pages = NULL; bp->b_page_count = 0; bp->b_addr = NULL; - bp->b_file_offset = 0; - bp->b_buffer_length = bp->b_count_desired = len; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_bn = XFS_BUF_DADDR_NULL; - bp->b_flags &= ~XBF_MAPPED; } static inline struct page * @@ -749,7 +745,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); + rval = _xfs_buf_get_pages(bp, page_count, 0); if (rval) return rval; @@ -760,9 +756,8 @@ xfs_buf_associate_memory( pageaddr += PAGE_SIZE; } - bp->b_count_desired = len; - bp->b_buffer_length = buflen; - bp->b_flags |= XBF_MAPPED; + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); return 0; } @@ -770,17 +765,18 @@ xfs_buf_associate_memory( xfs_buf_t * xfs_buf_get_uncached( struct xfs_buftarg *target, - size_t len, + size_t numblks, int flags) { - unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + unsigned long page_count; int error, i; xfs_buf_t *bp; - bp = xfs_buf_alloc(target, 0, len, 0); + bp = xfs_buf_alloc(target, 0, numblks, 0); if (unlikely(bp == NULL)) goto fail; + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; error = _xfs_buf_get_pages(bp, page_count, 0); if (error) goto fail_free_buf; @@ -792,7 +788,7 @@ xfs_buf_get_uncached( } bp->b_flags |= _XBF_PAGES; - error = _xfs_buf_map_pages(bp, XBF_MAPPED); + error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); @@ -855,7 +851,7 @@ xfs_buf_rele( spin_unlock(&pag->pag_buf_lock); } else { xfs_buf_lru_del(bp); - ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); @@ -915,13 +911,6 @@ xfs_buf_lock( trace_xfs_buf_lock_done(bp, _RET_IP_); } -/* - * Releases the lock on the buffer object. - * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to - * take a reference for the delwri queue because the unlocker is going to - * drop their's and they don't know we just queued it. - */ void xfs_buf_unlock( struct xfs_buf *bp) @@ -1008,9 +997,8 @@ xfs_buf_ioerror_alert( const char *func) { xfs_alert(bp->b_target->bt_mount, -"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", - (__uint64_t)XFS_BUF_ADDR(bp), func, - bp->b_error, XFS_BUF_COUNT(bp)); +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); } int @@ -1019,10 +1007,11 @@ xfs_bwrite( { int error; + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); - xfs_buf_delwri_dequeue(bp); xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); @@ -1181,7 +1170,7 @@ _xfs_buf_ioapply( int rw, map_i, total_nr_pages, nr_pages; struct bio *bio; int offset = bp->b_offset; - int size = bp->b_count_desired; + int size = BBTOB(bp->b_io_length); sector_t sector = bp->b_bn; total_nr_pages = bp->b_page_count; @@ -1229,7 +1218,7 @@ next_chunk: break; offset = 0; - sector += nbytes >> BBSHIFT; + sector += BTOBB(nbytes); size -= nbytes; total_nr_pages--; } @@ -1248,13 +1237,13 @@ next_chunk: } } -int +void xfs_buf_iorequest( xfs_buf_t *bp) { trace_xfs_buf_iorequest(bp, _RET_IP_); - ASSERT(!(bp->b_flags & XBF_DELWRI)); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1269,13 +1258,12 @@ xfs_buf_iorequest( _xfs_buf_ioend(bp, 0); xfs_buf_rele(bp); - return 0; } /* - * Waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. - * It returns the I/O error code, if any, or 0 if there was no error. + * Waits for I/O to complete on the buffer supplied. It returns immediately if + * no I/O is pending or there is already a pending error on the buffer. It + * returns the I/O error code, if any, or 0 if there was no error. */ int xfs_buf_iowait( @@ -1283,7 +1271,8 @@ xfs_buf_iowait( { trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); + if (!bp->b_error) + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); return bp->b_error; @@ -1296,7 +1285,7 @@ xfs_buf_offset( { struct page *page; - if (bp->b_flags & XBF_MAPPED) + if (bp->b_addr) return bp->b_addr + offset; offset += bp->b_offset; @@ -1315,27 +1304,30 @@ xfs_buf_iomove( void *data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { - size_t bend, cpoff, csize; - struct page *page; + size_t bend; bend = boff + bsize; while (boff < bend) { - page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; - cpoff = xfs_buf_poff(boff + bp->b_offset); - csize = min_t(size_t, - PAGE_SIZE-cpoff, bp->b_count_desired-boff); + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); - ASSERT(((csize + cpoff) <= PAGE_SIZE)); + ASSERT((csize + page_offset) <= PAGE_SIZE); switch (mode) { case XBRW_ZERO: - memset(page_address(page) + cpoff, 0, csize); + memset(page_address(page) + page_offset, 0, csize); break; case XBRW_READ: - memcpy(data, page_address(page) + cpoff, csize); + memcpy(data, page_address(page) + page_offset, csize); break; case XBRW_WRITE: - memcpy(page_address(page) + cpoff, data, csize); + memcpy(page_address(page) + page_offset, data, csize); } boff += csize; @@ -1435,11 +1427,9 @@ xfs_free_buftarg( { unregister_shrinker(&btp->bt_shrinker); - xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - kthread_stop(btp->bt_task); kmem_free(btp); } @@ -1491,20 +1481,6 @@ xfs_setsize_buftarg( return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); } -STATIC int -xfs_alloc_delwri_queue( - xfs_buftarg_t *btp, - const char *fsname) -{ - INIT_LIST_HEAD(&btp->bt_delwri_queue); - spin_lock_init(&btp->bt_delwri_lock); - btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) - return PTR_ERR(btp->bt_task); - return 0; -} - xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -1527,8 +1503,6 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwri_queue(btp, fsname)) - goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&btp->bt_shrinker); @@ -1539,125 +1513,52 @@ error: return NULL; } - /* - * Delayed write buffer handling + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. */ -void +bool xfs_buf_delwri_queue( - xfs_buf_t *bp) + struct xfs_buf *bp, + struct list_head *list) { - struct xfs_buftarg *btp = bp->b_target; - - trace_xfs_buf_delwri_queue(bp, _RET_IP_); - + ASSERT(xfs_buf_islocked(bp)); ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(&btp->bt_delwri_lock); - if (!list_empty(&bp->b_list)) { - /* if already in the queue, move it to the tail */ - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_move_tail(&bp->b_list, &btp->bt_delwri_queue); - } else { - /* start xfsbufd as it is about to have something to do */ - if (list_empty(&btp->bt_delwri_queue)) - wake_up_process(bp->b_target->bt_task); - - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; - list_add_tail(&bp->b_list, &btp->bt_delwri_queue); - } - bp->b_queuetime = jiffies; - spin_unlock(&btp->bt_delwri_lock); -} - -void -xfs_buf_delwri_dequeue( - xfs_buf_t *bp) -{ - int dequeued = 0; - - spin_lock(&bp->b_target->bt_delwri_lock); - if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_del_init(&bp->b_list); - dequeued = 1; + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. + */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(&bp->b_target->bt_delwri_lock); - - if (dequeued) - xfs_buf_rele(bp); - - trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); -} - -/* - * If a delwri buffer needs to be pushed before it has aged out, then promote - * it to the head of the delwri queue so that it will be flushed on the next - * xfsbufd run. We do this by resetting the queuetime of the buffer to be older - * than the age currently needed to flush the buffer. Hence the next time the - * xfsbufd sees it is guaranteed to be considered old enough to flush. - */ -void -xfs_buf_delwri_promote( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; - ASSERT(bp->b_flags & XBF_DELWRI); - ASSERT(bp->b_flags & _XBF_DELWRI_Q); + trace_xfs_buf_delwri_queue(bp, _RET_IP_); /* - * Check the buffer age before locking the delayed write queue as we - * don't need to promote buffers that are already past the flush age. + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. */ - if (bp->b_queuetime < jiffies - age) - return; - bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwri_lock); - list_move(&bp->b_list, &btp->bt_delwri_queue); - spin_unlock(&btp->bt_delwri_lock); -} - -/* - * Move as many buffers as specified to the supplied list - * idicating if we skipped any buffers to prevent deadlocks. - */ -STATIC int -xfs_buf_delwri_split( - xfs_buftarg_t *target, - struct list_head *list, - unsigned long age) -{ - xfs_buf_t *bp, *n; - int skipped = 0; - int force; - - force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); - INIT_LIST_HEAD(list); - spin_lock(&target->bt_delwri_lock); - list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { - if (!force && - time_before(jiffies, bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - list_move_tail(&bp->b_list, list); - trace_xfs_buf_delwri_split(bp, _RET_IP_); - } else - skipped++; + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); } - spin_unlock(&target->bt_delwri_lock); - return skipped; + return true; } /* @@ -1683,99 +1584,109 @@ xfs_buf_cmp( return 0; } -STATIC int -xfsbufd( - void *data) +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) { - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - - current->flags |= PF_MEMALLOC; - - set_freezable(); + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; + } else { + xfs_buf_lock(bp); + } - do { - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); - struct list_head tmp; - struct blk_plug plug; + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } - if (unlikely(freezing(current))) - try_to_freeze(); + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } - /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwri_queue)) - tout = MAX_SCHEDULE_TIMEOUT; - schedule_timeout_interruptible(tout); + list_sort(NULL, io_list, xfs_buf_cmp); - xfs_buf_delwri_split(target, &tmp, age); - list_sort(NULL, &tmp, xfs_buf_cmp); + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags |= XBF_WRITE; - blk_start_plug(&plug); - while (!list_empty(&tmp)) { - struct xfs_buf *bp; - bp = list_first_entry(&tmp, struct xfs_buf, b_list); + if (!wait) { + bp->b_flags |= XBF_ASYNC; list_del_init(&bp->b_list); - xfs_bdstrat_cb(bp); } - blk_finish_plug(&plug); - } while (!kthread_should_stop()); + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); - return 0; + return pinned; } /* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. AIL pushing as the @buffer_list is consumed in this + * function. */ int -xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) { - xfs_buf_t *bp; - int pincount = 0; - LIST_HEAD(tmp_list); - LIST_HEAD(wait_list); - struct blk_plug plug; + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} - flush_workqueue(xfslogd_workqueue); +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; - set_bit(XBT_FORCE_FLUSH, &target->bt_flags); - pincount = xfs_buf_delwri_split(target, &tmp_list, 0); + __xfs_buf_delwri_submit(buffer_list, &io_list, true); - /* - * Dropped the delayed write list lock, now walk the temporary list. - * All I/O is issued async and then if we need to wait for completion - * we do that after issuing all the IO. - */ - list_sort(NULL, &tmp_list, xfs_buf_cmp); + /* Wait for IO to complete. */ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); - blk_start_plug(&plug); - while (!list_empty(&tmp_list)) { - bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); - ASSERT(target == bp->b_target); list_del_init(&bp->b_list); - if (wait) { - bp->b_flags &= ~XBF_ASYNC; - list_add(&bp->b_list, &wait_list); - } - xfs_bdstrat_cb(bp); - } - blk_finish_plug(&plug); - - if (wait) { - /* Wait for IO to complete. */ - while (!list_empty(&wait_list)) { - bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - - list_del_init(&bp->b_list); - xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } + error2 = xfs_buf_iowait(bp); + xfs_buf_relse(bp); + if (!error) + error = error2; } - return pincount; + return error; } int __init diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5bf3be45f54..7f1d1392ce3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -32,11 +32,6 @@ #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - typedef enum { XBRW_READ = 1, /* transfer into target memory */ XBRW_WRITE = 2, /* transfer from target memory */ @@ -46,11 +41,9 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ -#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -58,14 +51,13 @@ typedef enum { #define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_LOCK (1 << 15)/* lock requested */ #define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ typedef unsigned int xfs_buf_flags_t; @@ -73,25 +65,18 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ - { XBF_MAPPED, "MAPPED" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ - { XBF_DELWRI, "DELWRI" }, \ { XBF_STALE, "STALE" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_LOCK, "LOCK" }, /* should never be set */\ - { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ - { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" } -typedef enum { - XBT_FORCE_FLUSH = 0, -} xfs_buftarg_flags_t; - typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; @@ -101,12 +86,6 @@ typedef struct xfs_buftarg { unsigned int bt_sshift; size_t bt_smask; - /* per device delwri queue */ - struct task_struct *bt_task; - struct list_head bt_delwri_queue; - spinlock_t bt_delwri_lock; - unsigned long bt_flags; - /* LRU control structures */ struct shrinker bt_shrinker; struct list_head bt_lru; @@ -128,8 +107,8 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rb_node b_rbnode; /* rbtree node */ - xfs_off_t b_file_offset; /* offset in file */ - size_t b_buffer_length;/* size of buffer in bytes */ + xfs_daddr_t b_bn; /* block number for I/O */ + int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ @@ -140,8 +119,6 @@ typedef struct xfs_buf { struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ - xfs_daddr_t b_bn; /* block number for I/O */ - size_t b_count_desired;/* desired transfer size */ void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ @@ -150,7 +127,7 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ - unsigned long b_queuetime; /* time buffer was queued */ + int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ @@ -163,26 +140,30 @@ typedef struct xfs_buf { /* Finding and Reading Buffers */ -extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t, xfs_buf_t *); +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, + struct xfs_buf *new_bp); #define xfs_incore(buftarg,blkno,len,lockit) \ _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) -extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); - -struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); -extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); -extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); -extern void xfs_buf_hold(xfs_buf_t *); -extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); -struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, - struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t length, int flags); +struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks); + +struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); +int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); + +struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + int flags); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t numblks, int flags); +void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ extern void xfs_buf_free(xfs_buf_t *); @@ -204,7 +185,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); -extern int xfs_buf_iorequest(xfs_buf_t *); +extern void xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); @@ -220,24 +201,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_queue(struct xfs_buf *); -extern void xfs_buf_delwri_dequeue(struct xfs_buf *); -extern void xfs_buf_delwri_promote(struct xfs_buf *); +extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +extern int xfs_buf_delwri_submit(struct list_head *); +extern int xfs_buf_delwri_submit_nowait(struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) - #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) @@ -256,12 +235,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) -#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { @@ -287,7 +260,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); -extern int xfs_flush_buftarg(xfs_buftarg_t *, int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index eac97ef81e2..45df2b857d4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -123,11 +122,11 @@ xfs_buf_item_log_check( ASSERT(bip->bli_logged != NULL); bp = bip->bli_buf; - ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(bp->b_length > 0); ASSERT(bp->b_addr != NULL); orig = bip->bli_orig; buffer = bp->b_addr; - for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + for (x = 0; x < BBTOB(bp->b_length); x++) { if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { xfs_emerg(bp->b_mount, "%s: bip %x buffer %x orig %x index %d", @@ -418,7 +417,6 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); @@ -455,42 +453,42 @@ xfs_buf_item_unpin( bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); + } else if (freed && remove) { + xfs_buf_lock(bp); + xfs_buf_ioerror(bp, EIO); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioend(bp, 0); } } -/* - * This is called to attempt to lock the buffer associated with this - * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, take a - * reference to the buffer. If this is a delayed write buffer that - * needs AIL help to be written back, invoke the pushbuf routine - * rather than the normal success path. - */ STATIC uint -xfs_buf_item_trylock( - struct xfs_log_item *lip) +xfs_buf_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + uint rval = XFS_ITEM_SUCCESS; if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - /* take a reference to the buffer. */ - xfs_buf_hold(bp); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - trace_xfs_buf_item_trylock(bip); - if (XFS_BUF_ISDELAYWRITE(bp)) - return XFS_ITEM_PUSHBUF; - return XFS_ITEM_SUCCESS; + + trace_xfs_buf_item_push(bip); + + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_unlock(bp); + return rval; } /* @@ -603,49 +601,6 @@ xfs_buf_item_committed( return lsn; } -/* - * The buffer is locked, but is not a delayed write buffer. This happens - * if we race with IO completion and hence we don't want to try to write it - * again. Just release the buffer. - */ -STATIC void -xfs_buf_item_push( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_push(bip); - - xfs_buf_relse(bp); -} - -/* - * The buffer is locked and is a delayed write buffer. Promote the buffer - * in the delayed write queue as the caller knows that they must invoke - * the xfsbufd to get this buffer written. We have to unlock the buffer - * to allow the xfsbufd to write it, too. - */ -STATIC bool -xfs_buf_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_pushbuf(bip); - - xfs_buf_delwri_promote(bp); - xfs_buf_relse(bp); - return true; -} - STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, @@ -661,11 +616,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, - .iop_trylock = xfs_buf_item_trylock, .iop_unlock = xfs_buf_item_unlock, .iop_committed = xfs_buf_item_committed, .iop_push = xfs_buf_item_push, - .iop_pushbuf = xfs_buf_item_pushbuf, .iop_committing = xfs_buf_item_committing }; @@ -703,7 +656,8 @@ xfs_buf_item_init( * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); + chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >> + XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, @@ -713,7 +667,7 @@ xfs_buf_item_init( xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_len = (ushort)bp->b_length; bip->bli_format.blf_map_size = map_size; #ifdef XFS_TRANS_DEBUG @@ -725,9 +679,9 @@ xfs_buf_item_init( * the buffer to indicate which bytes the callers have asked * to have logged. */ - bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); - bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); + bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); + memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); + bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); #endif /* @@ -984,20 +938,27 @@ xfs_buf_iodone_callbacks( * If the write was asynchronous then no one will be looking for the * error. Clear the error state and write the buffer out again. * - * During sync or umount we'll write all pending buffers again - * synchronous, which will catch these errors if they keep hanging - * around. + * XXX: This helps against transient write errors, but we need to find + * a way to shut the filesystem down if the writes keep failing. + * + * In practice we'll shut the filesystem down soon as non-transient + * erorrs tend to affect the whole device and a failing log write + * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { + ASSERT(bp->b_iodone != NULL); + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - xfs_buf_delwri_queue(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + xfs_bdstrat_cb(bp); + } else { + xfs_buf_relse(bp); } - ASSERT(bp->b_iodone != NULL); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); + return; } @@ -1045,6 +1006,6 @@ xfs_buf_iodone( * Either way, AIL is useless if we're forcing a shutdown. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7f1a6f5b05a..015b946c580 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -2277,20 +2276,20 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) if (nbuf == 1) { dabuf->nbuf = 1; bp = bps[0]; - dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount = bp->b_length; dabuf->data = bp->b_addr; dabuf->bps[0] = bp; } else { dabuf->nbuf = nbuf; for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount += bp->b_length; } dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) { bp = bps[i]; memcpy((char *)dabuf->data + off, bp->b_addr, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } return dabuf; @@ -2310,10 +2309,10 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) ASSERT(dabuf->nbuf > 1); dabuf->dirty = 0; for (i = off = 0; i < dabuf->nbuf; - i++, off += XFS_BUF_COUNT(bp)) { + i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; memcpy(bp->b_addr, dabuf->data + off, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } } @@ -2356,10 +2355,10 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) } dabuf->dirty = 1; ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; f = off; - l = f + XFS_BUF_COUNT(bp) - 1; + l = f + BBTOB(bp->b_length) - 1; if (f < first) f = first; if (l > last) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 1137bbc5ecc..e00de08dc8a 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index a2e27010c7f..67a250c36d4 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index d3b63aefd01..586732f2d80 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 5bbe2a8a023..2046988e9eb 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 66e108f561a..397ffbcbab1 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 0179a41d9e5..b0f26780449 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 79d05e84e29..19bf0c5e38f 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 1ad3a4b8ca4..f9c3fe304a1 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -30,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" #include "xfs_trace.h" @@ -118,7 +118,7 @@ xfs_trim_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. */ - if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + if (xfs_extent_busy_search(mp, agno, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } @@ -212,7 +212,7 @@ xfs_discard_extents( struct xfs_mount *mp, struct list_head *list) { - struct xfs_busy_extent *busyp; + struct xfs_extent_busy *busyp; int error = 0; list_for_each_entry(busyp, list, list) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1155208fa83..bf27fcca484 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -857,7 +856,7 @@ xfs_qm_dqflush_done( /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); if (lip->li_lsn == qip->qli_flush_lsn) - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); else spin_unlock(&ailp->xa_lock); } @@ -878,8 +877,8 @@ xfs_qm_dqflush_done( */ int xfs_qm_dqflush( - xfs_dquot_t *dqp, - uint flags) + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; @@ -891,25 +890,30 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - /* - * If not dirty, or it's pinned and we are not supposed to block, nada. - */ - if (!XFS_DQ_IS_DIRTY(dqp) || - ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) { - xfs_dqfunlock(dqp); - return 0; - } + *bpp = NULL; + xfs_qm_dqunpin_wait(dqp); /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an emptry AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - xfs_dqfunlock(dqp); - return XFS_ERROR(EIO); + + spin_lock(&mp->m_ail->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(mp->m_ail, lip, + SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&mp->m_ail->xa_lock); + error = XFS_ERROR(EIO); + goto out_unlock; } /* @@ -917,11 +921,8 @@ xfs_qm_dqflush( */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); - if (error) { - ASSERT(error != ENOENT); - xfs_dqfunlock(dqp); - return error; - } + if (error) + goto out_unlock; /* * Calculate the location of the dquot inside the buffer. @@ -967,20 +968,13 @@ xfs_qm_dqflush( xfs_log_force(mp, 0); } - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - trace_xfs_dqflush_done(dqp); + *bpp = bp; + return 0; - /* - * dqp is still locked, but caller is free to unlock it now. - */ - return error; - +out_unlock: + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); } /* @@ -1011,39 +1005,6 @@ xfs_dqlock2( } } -/* - * Give the buffer a little push if it is incore and - * wait on the flush lock. - */ -void -xfs_dqflock_pushbuf_wait( - xfs_dquot_t *dqp) -{ - xfs_mount_t *mp = dqp->q_mount; - xfs_buf_t *bp; - - /* - * Check to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - if (!bp) - goto out_lock; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - xfs_buf_delwri_promote(bp); - wake_up_process(bp->b_target->bt_task); - } - xfs_buf_relse(bp); -out_lock: - xfs_dqflock(dqp); -} - int __init xfs_qm_init(void) { diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index ef9190bd8b3..7d20af27346 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -141,7 +141,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(xfs_dquot_t *, uint); +extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); @@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); -extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 34baeae4526..57aa4b03720 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -108,38 +106,6 @@ xfs_qm_dquot_logitem_unpin( wake_up(&dqp->q_pinwait); } -/* - * Given the logitem, this writes the corresponding dquot entry to disk - * asynchronously. This is called with the dquot entry securely locked; - * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot - * at the end. - */ -STATIC void -xfs_qm_dquot_logitem_push( - struct xfs_log_item *lip) -{ - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - int error; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(!completion_done(&dqp->q_flush)); - - /* - * Since we were able to lock the dquot's flush lock and - * we found it on the AIL, the dquot must be dirty. This - * is because the dquot is removed from the AIL while still - * holding the flush lock in xfs_dqflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the dquot. - */ - error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); - if (error) - xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", - __func__, error, dqp); - xfs_dqunlock(dqp); -} - STATIC xfs_lsn_t xfs_qm_dquot_logitem_committed( struct xfs_log_item *lip, @@ -171,67 +137,15 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } -/* - * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that - * the dquot is locked by us, but the flush lock isn't. So, here we are - * going to see if the relevant dquot buffer is incore, waiting on DELWRI. - * If so, we want to push it out to help us take this item off the AIL as soon - * as possible. - * - * We must not be holding the AIL lock at this point. Calling incore() to - * search the buffer cache can be a time consuming thing, and AIL lock is a - * spinlock. - */ -STATIC bool -xfs_qm_dquot_logitem_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - struct xfs_dquot *dqp = qlip->qli_dquot; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. - */ - if (completion_done(&dqp->q_flush) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_dqunlock(dqp); - return true; - } - - bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, - dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - xfs_dqunlock(dqp); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to attempt to lock the dquot associated with this - * dquot log item. Don't sleep on the dquot lock or the flush lock. - * If the flush lock is already held, indicating that the dquot has - * been or is in the process of being flushed, then see if we can - * find the dquot's buffer in the buffer cache without sleeping. If - * we can and it is marked delayed write, then we want to send it out. - * We delay doing so until the push routine, though, to avoid sleeping - * in any device strategy routines. - */ STATIC uint -xfs_qm_dquot_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; @@ -239,16 +153,41 @@ xfs_qm_dquot_logitem_trylock( if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the quota lock. + */ + if (atomic_read(&dqp->q_pincount) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the dquot. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_dqflock_nowait(dqp)) { - /* - * dquot has already been flushed to the backing buffer, - * leave it locked, pushbuf routine will unlock it. - */ - return XFS_ITEM_PUSHBUF; + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - ASSERT(lip->li_flags & XFS_LI_IN_AIL); - return XFS_ITEM_SUCCESS; + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + } else { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_dqunlock(dqp); + return rval; } /* @@ -299,11 +238,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, .iop_unpin = xfs_qm_dquot_logitem_unpin, - .iop_trylock = xfs_qm_dquot_logitem_trylock, .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, .iop_committing = xfs_qm_dquot_logitem_committing }; @@ -398,11 +335,13 @@ xfs_qm_qoff_logitem_unpin( } /* - * Quotaoff items have no locking, so just return success. + * There isn't much you can do to push a quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. */ STATIC uint -xfs_qm_qoff_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_LOCKED; } @@ -429,17 +368,6 @@ xfs_qm_qoff_logitem_committed( return lsn; } -/* - * There isn't much you can do to push on an quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_qm_qoff_logitem_push( - struct xfs_log_item *lip) -{ -} - - STATIC xfs_lsn_t xfs_qm_qoffend_logitem_committed( struct xfs_log_item *lip, @@ -454,7 +382,7 @@ xfs_qm_qoffend_logitem_committed( * xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); kmem_free(qfs); kmem_free(qfe); @@ -487,7 +415,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, @@ -502,7 +429,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoff_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 39f06336b99..610456054dc 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 558910f5e3c..42679223a0f 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_types.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -53,19 +52,18 @@ static int xfs_fileid_length(int fileid_type) STATIC int xfs_fs_encode_fh( - struct dentry *dentry, - __u32 *fh, - int *max_len, - int connectable) + struct inode *inode, + __u32 *fh, + int *max_len, + struct inode *parent) { struct fid *fid = (struct fid *)fh; struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; - struct inode *inode = dentry->d_inode; int fileid_type; int len; /* Directories don't need their parent encoded, they have ".." */ - if (S_ISDIR(inode->i_mode) || !connectable) + if (!parent) fileid_type = FILEID_INO32_GEN; else fileid_type = FILEID_INO32_GEN_PARENT; @@ -97,20 +95,16 @@ xfs_fs_encode_fh( switch (fileid_type) { case FILEID_INO32_GEN_PARENT: - spin_lock(&dentry->d_lock); - fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; - fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; - spin_unlock(&dentry->d_lock); + fid->i32.parent_ino = XFS_I(parent)->i_ino; + fid->i32.parent_gen = parent->i_generation; /*FALLTHRU*/ case FILEID_INO32_GEN: fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; break; case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: - spin_lock(&dentry->d_lock); - fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; - fid64->parent_gen = dentry->d_parent->d_inode->i_generation; - spin_unlock(&dentry->d_lock); + fid64->parent_ino = XFS_I(parent)->i_ino; + fid64->parent_gen = parent->i_generation; /*FALLTHRU*/ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: fid64->ino = XFS_I(inode)->i_ino; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c new file mode 100644 index 00000000000..85e9f87a1a7 --- /dev/null +++ b/fs/xfs/xfs_extent_busy.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_inode.h" +#include "xfs_extent_busy.h" +#include "xfs_trace.h" + +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_extent_busy *new; + struct xfs_extent_busy *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_extent_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_extent_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_extent_busy *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entries block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_extent_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. + */ + if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ +void +xfs_extent_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +void +xfs_extent_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { + if (!xfs_extent_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. + */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +STATIC void +xfs_extent_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp) +{ + if (busyp->length) { + trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. + */ +void +xfs_extent_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_extent_busy *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + else + xfs_extent_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_extent_busy_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_extent_busy, list)->agno - + container_of(b, struct xfs_extent_busy, list)->agno; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h new file mode 100644 index 00000000000..985412d65ba --- /dev/null +++ b/fs/xfs/xfs_extent_busy.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTENT_BUSY_H__ +#define __XFS_EXTENT_BUSY_H__ + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_extent_busy_insert() for details. + */ +struct xfs_extent_busy { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +void +xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +void +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, + xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); + +int +xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_extent_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_extent_busy_ag_cmp); +} + +#endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 35c2aff38b2..feb36d7551a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_sb.h" @@ -64,7 +63,8 @@ __xfs_efi_release( if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } @@ -147,22 +147,20 @@ xfs_efi_item_unpin( } /* - * Efi items have no locking or pushing. However, since EFIs are - * pulled from the AIL when their corresponding EFDs are committed - * to disk, their situation is very similar to being pinned. Return - * XFS_ITEM_PINNED so that the caller will eventually flush the log. - * This should help in getting the EFI out of the AIL. + * Efi items have no locking or pushing. However, since EFIs are pulled from + * the AIL when their corresponding EFDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the EFI out of + * the AIL. */ STATIC uint -xfs_efi_item_trylock( - struct xfs_log_item *lip) +xfs_efi_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_PINNED; } -/* - * Efi items have no locking, so just return. - */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -190,17 +188,6 @@ xfs_efi_item_committed( } /* - * There isn't much you can do to push on an efi item. It is simply - * stuck waiting for all of its corresponding efd items to be - * committed to disk. - */ -STATIC void -xfs_efi_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFI dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, .iop_unpin = xfs_efi_item_unpin, - .iop_trylock = xfs_efi_item_trylock, .iop_unlock = xfs_efi_item_unlock, .iop_committed = xfs_efi_item_committed, .iop_push = xfs_efi_item_push, @@ -404,19 +390,17 @@ xfs_efd_item_unpin( } /* - * Efd items have no locking, so just return success. + * There isn't much you can do to push on an efd item. It is simply stuck + * waiting for the log to be flushed to disk. */ STATIC uint -xfs_efd_item_trylock( - struct xfs_log_item *lip) +xfs_efd_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { - return XFS_ITEM_LOCKED; + return XFS_ITEM_PINNED; } -/* - * Efd items have no locking or pushing, so return failure - * so that the caller doesn't bother with us. - */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) @@ -451,16 +435,6 @@ xfs_efd_item_committed( } /* - * There isn't much you can do to push on an efd item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_efd_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFD dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, .iop_unpin = xfs_efd_item_unpin, - .iop_trylock = xfs_efd_item_trylock, .iop_unlock = xfs_efd_item_unlock, .iop_committed = xfs_efd_item_committed, .iop_push = xfs_efd_item_push, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 54a67dd9ac0..9f7ec15a652 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -396,114 +394,96 @@ xfs_file_splice_write( } /* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. + * This routine is called to handle zeroing any space in the last block of the + * file that is beyond the EOF. We do this since the size is being increased + * without writing anything to that block and we don't want to read the + * garbage on the disk. */ STATIC int /* error (positive) */ xfs_zero_last_block( - xfs_inode_t *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) + struct xfs_inode *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) { - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = ip->i_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); + int zero_offset = XFS_B_FSB_OFFSET(mp, isize); + int zero_len; + int nimaps = 1; + int error = 0; + struct xfs_bmbt_irec imap; - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; + ASSERT(nimaps > 0); + /* * If the block underlying isize is just a hole, then there * is nothing to zero. */ - if (imap.br_startblock == HOLESTARTBLOCK) { + if (imap.br_startblock == HOLESTARTBLOCK) return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; - error = xfs_iozero(ip, isize, zero_len); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; + return xfs_iozero(ip, isize, zero_len); } /* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. + * Zero any on disk space between the current EOF and the new, larger EOF. + * + * This handles the normal case of zeroing the remainder of the last block in + * the file and the unusual case of zeroing blocks out beyond the size of the + * file. This second case only happens with fixed size extents and when the + * system crashes before the inode size was updated but after blocks were + * allocated. + * + * Expects the iolock to be held exclusive, and will take the ilock internally. */ - int /* error (positive) */ xfs_zero_eof( - xfs_inode_t *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + struct xfs_inode *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + struct xfs_bmbt_irec imap; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); /* * First handle zeroing the block on which isize resides. + * * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, offset, isize); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; + if (XFS_B_FSB_OFFSET(mp, isize) != 0) { + error = xfs_zero_last_block(ip, offset, isize); + if (error) + return error; } /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. + * Calculate the range between the new size and the old where blocks + * needing to be zeroed may exist. + * + * To get the block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back to a block + * boundary. We subtract 1 in case the size is exactly on a block + * boundary. */ last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); @@ -521,23 +501,18 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, &imap, &nimaps, 0); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } + ASSERT(nimaps > 0); if (imap.br_state == XFS_EXT_UNWRITTEN || imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); continue; @@ -545,11 +520,7 @@ xfs_zero_eof( /* * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); @@ -557,22 +528,14 @@ xfs_zero_eof( zero_len = offset - zero_off; error = xfs_iozero(ip, zero_off, zero_len); - if (error) { - goto out_lock; - } + if (error) + return error; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); } return 0; - -out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; } /* @@ -593,35 +556,29 @@ xfs_file_aio_write_checks( struct xfs_inode *ip = XFS_I(inode); int error = 0; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); - if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which involves - * dropping all locks and relocking to maintain correct locking order. - * If we do this, restart the function to ensure all checks and values - * are still valid. + * iolock shared, we need to update it to exclusive which implies + * having to redo all checks before. */ if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + if (error) + return error; } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; /* * Updating the timestamps will grab the ilock again from @@ -629,8 +586,11 @@ restart: * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ - if (likely(!(file->f_mode & FMODE_NOCMTIME))) - file_update_time(file); + if (likely(!(file->f_mode & FMODE_NOCMTIME))) { + error = file_update_time(file); + if (error) + return error; + } /* * If we're writing the file then make sure to clear the setuid and @@ -638,7 +598,6 @@ restart: * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); - } /* @@ -1007,8 +966,149 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +STATIC loff_t +xfs_seek_data( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec map[2]; + int nmap = 2; + loff_t uninitialized_var(offset); + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + xfs_filblks_t end; + uint lock; + int error; + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + + /* + * Try to read extents from the first block indicated + * by fsbno to the end block of the file. + */ + end = XFS_B_TO_FSB(mp, isize); + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* + * Treat unwritten extent as data extent since it might + * contains dirty data in page cache. + */ + if (map[0].br_startblock != HOLESTARTBLOCK) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[0].br_startoff)); + } else { + if (nmap == 1) { + error = ENXIO; + goto out_unlock; + } + + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_seek_hole( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + loff_t uninitialized_var(offset); + loff_t holeoff; + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + uint lock; + int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); + if (error) + goto out_unlock; + + holeoff = XFS_FSB_TO_B(mp, fsbno); + if (holeoff <= start) + offset = start; + else { + /* + * xfs_bmap_first_unused() could return a value bigger than + * isize if there are no more holes past the supplied offset. + */ + offset = min_t(loff_t, holeoff, isize); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_file_llseek( + struct file *file, + loff_t offset, + int origin) +{ + switch (origin) { + case SEEK_END: + case SEEK_CUR: + case SEEK_SET: + return generic_file_llseek(file, offset, origin); + case SEEK_DATA: + return xfs_seek_data(file, offset, origin); + case SEEK_HOLE: + return xfs_seek_hole(file, offset, origin); + default: + return -EINVAL; + } +} + const struct file_operations xfs_file_operations = { - .llseek = generic_file_llseek, + .llseek = xfs_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = xfs_file_aio_read, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 1c6fdeb702f..c25b094efbf 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -18,8 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -39,7 +37,6 @@ #include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" -#include "xfs_rw.h" #include "xfs_filestream.h" #include "xfs_trace.h" @@ -147,9 +144,9 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -193,7 +190,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -230,7 +227,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -259,8 +256,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -286,8 +282,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -314,8 +309,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -405,7 +399,7 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_read_buf(mp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) { @@ -693,3 +687,63 @@ xfs_fs_goingdown( return 0; } + +/* + * Force a shutdown of the filesystem instantly while keeping the filesystem + * consistent. We don't do an unmount here; just shutdown the shop, make sure + * that absolutely nothing persistent happens to this filesystem after this + * point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index dad1a31aa4f..177a21a7ac4 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,8 +200,7 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!fbuf) return ENOMEM; /* @@ -610,6 +609,13 @@ xfs_ialloc_get_rec( /* * Visible inode allocation functions. */ +/* + * Find a free (set) bit in the inode bitmask. + */ +static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) +{ + return xfs_lowbit64(*fp); +} /* * Allocate an inode on disk. diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 666a037398d..65ac57c8063 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -47,15 +47,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) } /* - * Find a free (set) bit in the inode bitmask. - */ -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} - - -/* * Allocate an inode on disk. * Mode is used to tell whether the new inode will need space, and whether * it is a directory. diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c6a75815aea..2b8b7a37aa1 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bcc6c249b2c..1bb4365e8c2 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -123,23 +122,7 @@ xfs_inode_free( xfs_idestroy_fork(ip, XFS_ATTR_FORK); if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - struct xfs_ail *ailp = lip->li_ailp; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, lip); - else - spin_unlock(&ailp->xa_lock); - } + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); xfs_inode_item_destroy(ip); ip->i_itemp = NULL; } @@ -334,9 +317,10 @@ xfs_iget_cache_miss( /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload - * region. + * region. Since we can be called from transaction context, don't + * recurse into the file system. */ - if (radix_tree_preload(GFP_KERNEL)) { + if (radix_tree_preload(GFP_NOFS)) { error = EAGAIN; goto out_destroy; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bc46c0a133d..a59eea09930 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,7 +20,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -61,6 +60,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer @@ -137,6 +150,7 @@ xfs_imap_to_bp( int ni; xfs_buf_t *bp; + buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp); if (error) { @@ -226,7 +240,7 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); if (error) return error; @@ -782,8 +796,7 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, - XBF_LOCK, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); if (error) return error; dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -1342,7 +1355,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -1423,7 +1436,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp() returned error %d.", __func__, error); @@ -1484,7 +1497,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", __func__, error); @@ -1566,8 +1579,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!bp) return ENOMEM; @@ -1737,7 +1749,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -2347,11 +2359,11 @@ cluster_corrupt_out: */ rcu_read_unlock(); /* - * Clean up the buffer. If it was B_DELWRI, just release it -- + * Clean up the buffer. If it was delwri, just release it -- * brelse can handle it with no problems. If not, shut down the * filesystem before releasing the buffer. */ - bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); if (bufwasdelwri) xfs_buf_relse(bp); @@ -2377,30 +2389,29 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq); + xfs_iflush_abort(iq, false); kmem_free(ilist); xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); } /* - * xfs_iflush() will write a modified inode's changes out to the - * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush completion must be - * active as well. The inode lock will still be held upon return from - * the call and the caller is free to unlock it. - * The inode flush will be completed when the inode reaches the disk. - * The flags indicate how the inode's buffer should be written out. + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. */ int xfs_iflush( - xfs_inode_t *ip, - uint flags) + struct xfs_inode *ip, + struct xfs_buf **bpp) { - xfs_inode_log_item_t *iip; - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; int error; XFS_STATS_INC(xs_iflush_count); @@ -2410,25 +2421,8 @@ xfs_iflush( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - iip = ip->i_itemp; - mp = ip->i_mount; + *bpp = NULL; - /* - * We can't flush the inode until it is unpinned, so wait for it if we - * are allowed to block. We know no one new can pin it, because we are - * holding the inode lock shared and you need to hold it exclusively to - * pin the inode. - * - * If we are not allowed to block, force the log out asynchronously so - * that when we come back the inode will be unpinned. If other inodes - * in the same cluster are dirty, they will probably write the inode - * out for us if they occur after the log force completes. - */ - if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin(ip); - xfs_ifunlock(ip); - return EAGAIN; - } xfs_iunpin_wait(ip); /* @@ -2447,20 +2441,20 @@ xfs_iflush( /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { - if (iip) - iip->ili_fields = 0; - xfs_ifunlock(ip); - return XFS_ERROR(EIO); + error = XFS_ERROR(EIO); + goto abort_out; } /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, - (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -2488,23 +2482,20 @@ xfs_iflush( if (error) goto cluster_corrupt_out; - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - return error; + *bpp = bp; + return 0; corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: + error = XFS_ERROR(EFSCORRUPTED); +abort_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(ip); - return XFS_ERROR(EFSCORRUPTED); + xfs_iflush_abort(ip, false); + return error; } @@ -2706,27 +2697,6 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } -void -xfs_promote_inode( - struct xfs_inode *ip) -{ - struct xfs_buf *bp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - - bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, - ip->i_imap.im_len, XBF_TRYLOCK); - if (!bp) - return; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_buf_delwri_promote(bp); - wake_up_process(ip->i_mount->m_ddev_targp->bt_task); - } - - xfs_buf_relse(bp); -} - /* * Return a pointer to the extent record at file index idx. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7fee3387e1c..1efff36a75b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -529,11 +529,12 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); -int xfs_iflush(xfs_inode_t *, uint); -void xfs_promote_inode(struct xfs_inode *); +int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 05d924efcea..6cdbf90c6f7 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -480,25 +478,16 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * This is called to attempt to lock the inode associated with this - * inode log item, in preparation for the push routine which does the actual - * iflush. Don't sleep on the inode lock or the flush lock. - * - * If the flush lock is already held, indicating that the inode has - * been or is in the process of being flushed, then (ideally) we'd like to - * see if the inode's buffer is still incore, and if so give it a nudge. - * We delay doing so until the pushbuf routine, though, to avoid holding - * the AIL lock across a call to the blackhole which is the buffer cache. - * Also we don't want to sleep in any device strategy routines, which can happen - * if we do the subsequent bawrite in here. - */ STATIC uint -xfs_inode_item_trylock( - struct xfs_log_item *lip) +xfs_inode_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; @@ -506,30 +495,50 @@ xfs_inode_item_trylock( if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the ilock. + */ + if (xfs_ipincount(ip) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the inode. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_iflock_nowait(ip)) { - /* - * inode has already been flushed to the backing buffer, - * leave it locked in shared mode, pushbuf routine will - * unlock it. - */ - return XFS_ITEM_PUSHBUF; + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - /* Stale items should force out the iclog */ + /* + * Stale inode items should force out the iclog. + */ if (ip->i_flags & XFS_ISTALE) { xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); return XFS_ITEM_PINNED; } -#ifdef DEBUG - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ASSERT(iip->ili_fields != 0); - ASSERT(iip->ili_logged == 0); - ASSERT(lip->li_flags & XFS_LI_IN_AIL); + ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_iflush(ip, &bp); + if (!error) { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); } -#endif - return XFS_ITEM_SUCCESS; + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return rval; } /* @@ -614,86 +623,6 @@ xfs_inode_item_committed( } /* - * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK - * failed to get the inode flush lock but did get the inode locked SHARED. - * Here we're trying to see if the inode buffer is incore, and if so whether it's - * marked delayed write. If that's the case, we'll promote it and that will - * allow the caller to write the buffer by triggering the xfsbufd to run. - */ -STATIC bool -xfs_inode_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - - /* - * If a flush is not in progress anymore, chances are that the - * inode was taken off the AIL. So, just get out. - */ - if (!xfs_isiflocked(ip) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return true; - } - - bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XBF_TRYLOCK); - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to asynchronously write the inode associated with this - * inode log item out to disk. The inode will already have been locked by - * a successful call to xfs_inode_item_trylock(). - */ -STATIC void -xfs_inode_item_push( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - - /* - * Since we were able to lock the inode's flush lock and - * we found it on the AIL, the inode must be dirty. This - * is because the inode is removed from the AIL while still - * holding the flush lock in xfs_iflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the inode. - */ - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0); - - /* - * Push the inode to it's backing buffer. This will not remove the - * inode from the AIL - a further push will be required to trigger a - * buffer push. However, this allows all the dirty inodes to be pushed - * to the buffer before it is pushed to disk. The buffer IO completion - * will pull the inode from the AIL, mark it clean and unlock the flush - * lock. - */ - (void) xfs_iflush(ip, SYNC_TRYLOCK); - xfs_iunlock(ip, XFS_ILOCK_SHARED); -} - -/* * XXX rcc - this one really has to do something. Probably needs * to stamp in a new field in the incore inode. */ @@ -713,11 +642,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, .iop_unpin = xfs_inode_item_unpin, - .iop_trylock = xfs_inode_item_trylock, .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_pushbuf = xfs_inode_item_pushbuf, .iop_committing = xfs_inode_item_committing }; @@ -848,7 +775,8 @@ xfs_iflush_done( ASSERT(i <= need_ail); } /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i); + xfs_trans_ail_delete_bulk(ailp, log_items, i, + SHUTDOWN_CORRUPT_INCORE); } @@ -869,16 +797,15 @@ xfs_iflush_done( } /* - * This is the inode flushing abort routine. It is called - * from xfs_iflush when the filesystem is shutting down to clean - * up the inode state. - * It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. + * This is the inode flushing abort routine. It is called from xfs_iflush when + * the filesystem is shutting down to clean up the inode state. It is + * responsible for removing the inode item from the AIL if it has not been + * re-logged, and unlocking the inode's flush lock. */ void xfs_iflush_abort( - xfs_inode_t *ip) + xfs_inode_t *ip, + bool stale) { xfs_inode_log_item_t *iip = ip->i_itemp; @@ -888,7 +815,10 @@ xfs_iflush_abort( spin_lock(&ailp->xa_lock); if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); + xfs_trans_ail_delete(ailp, &iip->ili_item, + stale ? + SHUTDOWN_LOG_IO_ERROR : + SHUTDOWN_CORRUPT_INCORE); } else spin_unlock(&ailp->xa_lock); } @@ -915,7 +845,7 @@ xfs_istale_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 41d61c3b7a3..376d4d0b263 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -165,7 +165,7 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index b253c0ea5be..90efdaf1706 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -26,11 +26,6 @@ * high agno_log-agblklog-inopblog bits - 0 */ -typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ - -#define NULLFSINO ((xfs_ino_t)-1) -#define NULLAGINO ((xfs_agino_t)-1) - struct xfs_mount; #define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 91f8ff547ab..3a05a41b5d7 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index a849a5473af..c4f2da0d2bf 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -22,9 +22,7 @@ #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 71a464503c4..aadfce6681e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -37,7 +35,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -142,11 +139,7 @@ xfs_iomap_write_direct( int committed; int error; - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) return XFS_ERROR(error); @@ -158,7 +151,7 @@ xfs_iomap_write_direct( if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - goto error_out; + return XFS_ERROR(error); } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) @@ -190,7 +183,6 @@ xfs_iomap_write_direct( /* * Allocate and setup the transaction */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), resrtextents, @@ -199,15 +191,16 @@ xfs_iomap_write_direct( /* * Check for running out of space, note: need lock to return */ - if (error) + if (error) { xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_out; error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) - goto error1; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); @@ -224,42 +217,39 @@ xfs_iomap_write_direct( error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, imap, &nimaps, &free_list); if (error) - goto error0; + goto out_bmap_cancel; /* * Complete the transaction */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto error0; + goto out_bmap_cancel; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) - goto error_out; + goto out_unlock; /* * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = ENOSPC; - goto error_out; + error = XFS_ERROR(ENOSPC); + goto out_unlock; } - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) error = xfs_alert_fsblock_zero(ip, imap); - goto error_out; - } - return 0; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; -error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ +out_bmap_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); +out_trans_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - -error_out: - return XFS_ERROR(error); + goto out_unlock; } /* @@ -422,6 +412,15 @@ retry: return error; } + /* + * Make sure preallocation does not create extents beyond the range we + * actually support in this filesystem. + */ + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset); + + ASSERT(last_fsb > offset_fsb); + nimaps = XFS_WRITE_IMAPS; error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, imap, &nimaps, XFS_BMAPI_ENTIRE); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 3011b879f85..1a25fd80279 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +32,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -700,7 +697,7 @@ xfs_setattr_size( xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; - uint lock_flags; + uint lock_flags = 0; uint commit_flags = 0; trace_xfs_setattr(ip); @@ -720,10 +717,10 @@ xfs_setattr_size( ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); - lock_flags = XFS_ILOCK_EXCL; - if (!(flags & XFS_ATTR_NOLOCK)) + if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; - xfs_ilock(ip, lock_flags); + xfs_ilock(ip, lock_flags); + } oldsize = inode->i_size; newsize = iattr->ia_size; @@ -746,7 +743,7 @@ xfs_setattr_size( /* * Make sure that the dquots are attached to the inode. */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) goto out_unlock; @@ -768,8 +765,6 @@ xfs_setattr_size( if (error) goto out_unlock; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index acc2bf264da..eff577a9b67 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6db1fef38bf..f30d9807dc4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -35,7 +33,6 @@ #include "xfs_trans_priv.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_rw.h" #include "xfs_trace.h" kmem_zone_t *xfs_log_ticket_zone; @@ -916,27 +913,42 @@ xfs_log_need_covered(xfs_mount_t *mp) * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t -xlog_assign_tail_lsn( +xlog_assign_tail_lsn_locked( struct xfs_mount *mp) { - xfs_lsn_t tail_lsn; struct log *log = mp->m_log; + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn; + + assert_spin_locked(&mp->m_ail->xa_lock); /* * To make sure we always have a valid LSN for the log tail we keep * track of the last LSN which was committed in log->l_last_sync_lsn, - * and use that when the AIL was empty and xfs_ail_min_lsn returns 0. - * - * If the AIL has been emptied we also need to wake any process - * waiting for this condition. + * and use that when the AIL was empty. */ - tail_lsn = xfs_ail_min_lsn(mp->m_ail); - if (!tail_lsn) + lip = xfs_ail_min(mp->m_ail); + if (lip) + tail_lsn = lip->li_lsn; + else tail_lsn = atomic64_read(&log->l_last_sync_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + + spin_lock(&mp->m_ail->xa_lock); + tail_lsn = xlog_assign_tail_lsn_locked(mp); + spin_unlock(&mp->m_ail->xa_lock); + + return tail_lsn; +} + /* * Return the space in the log between the tail and the head. The head * is passed in the cycle/bytes formal parms. In the special case where @@ -1172,7 +1184,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; @@ -1182,9 +1194,6 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_icloglock); init_waitqueue_head(&log->l_flush_wait); - /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ - ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); - iclogp = &log->l_iclog; /* * The amount of memory to allocate for the iclog structure is @@ -1204,7 +1213,7 @@ xlog_alloc_log(xfs_mount_t *mp, prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - log->l_iclog_size, 0); + BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_iclog; @@ -1224,7 +1233,7 @@ xlog_alloc_log(xfs_mount_t *mp, head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); @@ -1475,7 +1484,7 @@ xlog_sync(xlog_t *log, } else { iclog->ic_bwritecnt = 1; } - XFS_BUF_SET_COUNT(bp, count); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); XFS_BUF_ASYNC(bp); @@ -1573,7 +1582,7 @@ xlog_dealloc_log(xlog_t *log) * always need to ensure that the extra buffer does not point to memory * owned by another log buffer before we free it. */ - xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; @@ -2932,6 +2941,7 @@ xfs_log_force( { int error; + trace_xfs_log_force(mp, 0); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3080,6 +3090,7 @@ xfs_log_force_lsn( { int error; + trace_xfs_log_force(mp, lsn); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3141,7 +3152,7 @@ xlog_ticket_alloc( int cnt, char client, bool permanent, - int alloc_flags) + xfs_km_flags_t alloc_flags) { struct xlog_ticket *tic; uint num_headers; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2c622bedb30..748d312850e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -152,6 +152,7 @@ int xfs_log_mount(struct xfs_mount *mp, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, struct xlog_in_core *iclog, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index d4fadbe8ac9..7d6197c5849 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log_priv.h" @@ -29,61 +27,10 @@ #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" /* - * Perform initial CIL structure initialisation. - */ -int -xlog_cil_init( - struct log *log) -{ - struct xfs_cil *cil; - struct xfs_cil_ctx *ctx; - - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); - if (!cil) - return ENOMEM; - - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); - if (!ctx) { - kmem_free(cil); - return ENOMEM; - } - - INIT_LIST_HEAD(&cil->xc_cil); - INIT_LIST_HEAD(&cil->xc_committing); - spin_lock_init(&cil->xc_cil_lock); - init_rwsem(&cil->xc_ctx_lock); - init_waitqueue_head(&cil->xc_commit_wait); - - INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); - ctx->sequence = 1; - ctx->cil = cil; - cil->xc_ctx = ctx; - cil->xc_current_sequence = ctx->sequence; - - cil->xc_log = log; - log->l_cilp = cil; - return 0; -} - -void -xlog_cil_destroy( - struct log *log) -{ - if (log->l_cilp->xc_ctx) { - if (log->l_cilp->xc_ctx->ticket) - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); - kmem_free(log->l_cilp->xc_ctx); - } - - ASSERT(list_empty(&log->l_cilp->xc_cil)); - kmem_free(log->l_cilp); -} - -/* * Allocate a new ticket. Failing to get a new ticket makes it really hard to * recover, so we don't allow failure here. Also, we allocate in a context that * we don't want to be issuing transactions from, so we need to tell the @@ -390,8 +337,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); - xfs_alloc_busy_sort(&ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); spin_lock(&ctx->cil->xc_cil_lock); @@ -404,7 +351,7 @@ xlog_cil_committed( ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); xfs_discard_extents(mp, &ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); } kmem_free(ctx); @@ -426,8 +373,7 @@ xlog_cil_committed( */ STATIC int xlog_cil_push( - struct log *log, - xfs_lsn_t push_seq) + struct log *log) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -443,39 +389,36 @@ xlog_cil_push( struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t commit_lsn; + xfs_lsn_t push_seq; if (!cil) return 0; - ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); - /* - * Lock out transaction commit, but don't block for background pushes - * unless we are well over the CIL space limit. See the definition of - * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic - * used here. - */ - if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_seq && - cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) - goto out_free_ticket; - down_write(&cil->xc_ctx_lock); - } + down_write(&cil->xc_ctx_lock); ctx = cil->xc_ctx; - /* check if we've anything to push */ - if (list_empty(&cil->xc_cil)) - goto out_skip; + spin_lock(&cil->xc_cil_lock); + push_seq = cil->xc_push_seq; + ASSERT(push_seq <= ctx->sequence); - /* check for spurious background flush */ - if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + /* + * Check if we've anything to push. If there is nothing, then we don't + * move on to a new sequence number and so we have to be able to push + * this sequence again later. + */ + if (list_empty(&cil->xc_cil)) { + cil->xc_push_seq = 0; + spin_unlock(&cil->xc_cil_lock); goto out_skip; + } + spin_unlock(&cil->xc_cil_lock); + /* check for a previously pushed seqeunce */ - if (push_seq && push_seq < cil->xc_ctx->sequence) + if (push_seq < cil->xc_ctx->sequence) goto out_skip; /* @@ -629,7 +572,6 @@ restart: out_skip: up_write(&cil->xc_ctx_lock); -out_free_ticket: xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); return 0; @@ -641,6 +583,82 @@ out_abort: return XFS_ERROR(EIO); } +static void +xlog_cil_push_work( + struct work_struct *work) +{ + struct xfs_cil *cil = container_of(work, struct xfs_cil, + xc_push_work); + xlog_cil_push(cil->xc_log); +} + +/* + * We need to push CIL every so often so we don't cache more than we can fit in + * the log. The limit really is that a checkpoint can't be more than half the + * log (the current checkpoint is not allowed to overwrite the previous + * checkpoint), but commit latency and memory usage limit this to a smaller + * size. + */ +static void +xlog_cil_push_background( + struct log *log) +{ + struct xfs_cil *cil = log->l_cilp; + + /* + * The cil won't be empty because we are called while holding the + * context lock so whatever we added to the CIL will still be there + */ + ASSERT(!list_empty(&cil->xc_cil)); + + /* + * don't do a background push if we haven't used up all the + * space available yet. + */ + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + return; + + spin_lock(&cil->xc_cil_lock); + if (cil->xc_push_seq < cil->xc_current_sequence) { + cil->xc_push_seq = cil->xc_current_sequence; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + } + spin_unlock(&cil->xc_cil_lock); + +} + +static void +xlog_cil_push_foreground( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + + if (!cil) + return; + + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); + + /* start on any pending background push to minimise wait time on it */ + flush_work(&cil->xc_push_work); + + /* + * If the CIL is empty or we've already pushed the sequence then + * there's no work we need to do. + */ + spin_lock(&cil->xc_cil_lock); + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + spin_unlock(&cil->xc_cil_lock); + return; + } + + cil->xc_push_seq = push_seq; + spin_unlock(&cil->xc_cil_lock); + + /* do the push now */ + xlog_cil_push(log); +} + /* * Commit a transaction with the given vector to the Committed Item List. * @@ -667,7 +685,6 @@ xfs_log_commit_cil( { struct log *log = mp->m_log; int log_flags = 0; - int push = 0; struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) @@ -719,21 +736,9 @@ xfs_log_commit_cil( */ xfs_trans_free_items(tp, *commit_lsn, 0); - /* check for background commit before unlock */ - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) - push = 1; + xlog_cil_push_background(log); up_read(&log->l_cilp->xc_ctx_lock); - - /* - * We need to push CIL every so often so we don't cache more than we - * can fit in the log. The limit really is that a checkpoint can't be - * more than half the log (the current checkpoint is not allowed to - * overwrite the previous checkpoint), but commit latency and memory - * usage limit this to a smaller size in most cases. - */ - if (push) - xlog_cil_push(log, 0); return 0; } @@ -746,9 +751,6 @@ xfs_log_commit_cil( * * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. - * - * XXX: Initially, just push the CIL unconditionally and return whatever - * commit lsn is there. It'll be empty, so this is broken for now. */ xfs_lsn_t xlog_cil_force_lsn( @@ -766,8 +768,7 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - if (sequence == cil->xc_current_sequence) - xlog_cil_push(log, sequence); + xlog_cil_push_foreground(log, sequence); /* * See if we can find a previous sequence still committing. @@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt( return false; return true; } + +/* + * Perform initial CIL structure initialisation. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2152900b79d..5bc33261f5b 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -417,6 +417,8 @@ struct xfs_cil { struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; + xfs_lsn_t xc_push_seq; }; /* @@ -553,7 +555,7 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, int count, char client, bool permanent, - int alloc_flags); + xfs_km_flags_t alloc_flags); static inline void diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8ecad5bad66..ca386909131 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -40,7 +40,6 @@ #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_quota.h" -#include "xfs_rw.h" #include "xfs_utils.h" #include "xfs_trace.h" @@ -120,7 +119,7 @@ xlog_get_bp( nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0); + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); if (bp) xfs_buf_unlock(bp); return bp; @@ -146,7 +145,7 @@ xlog_align( { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(offset + nbblks <= bp->b_length); return bp->b_addr + BBTOB(offset); } @@ -174,11 +173,12 @@ xlog_bread_noalign( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); @@ -218,7 +218,7 @@ xlog_bread_offset( xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; - int orig_len = bp->b_buffer_length; + int orig_len = BBTOB(bp->b_length); int error, error2; error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); @@ -259,13 +259,14 @@ xlog_bwrite( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); xfs_buf_hold(bp); xfs_buf_lock(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; error = xfs_bwrite(bp); if (error) @@ -440,6 +441,8 @@ xlog_find_verify_cycle( * a log sector, or we're out of luck. */ bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) @@ -1225,6 +1228,8 @@ xlog_write_log_records( * log sector, or we're out of luck. */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) @@ -1772,7 +1777,7 @@ xlog_recover_do_inode_buffer( trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1814,7 +1819,8 @@ xlog_recover_do_inode_buffer( ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the @@ -1873,8 +1879,8 @@ xlog_recover_do_reg_buffer( ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -2103,6 +2109,7 @@ xlog_recover_do_dquot_buffer( STATIC int xlog_recover_buffer_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; @@ -2123,9 +2130,9 @@ xlog_recover_buffer_pass2( trace_xfs_log_recover_buf_recover(log, buf_f); - buf_flags = XBF_LOCK; - if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) - buf_flags |= XBF_MAPPED; + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags); @@ -2166,14 +2173,14 @@ xlog_recover_buffer_pass2( */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); } xfs_buf_relse(bp); @@ -2183,6 +2190,7 @@ xlog_recover_buffer_pass2( STATIC int xlog_recover_inode_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_inode_log_format_t *in_f; @@ -2220,8 +2228,7 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XBF_LOCK); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); if (!bp) { error = ENOMEM; goto error; @@ -2436,7 +2443,7 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); error: if (need_free) @@ -2477,6 +2484,7 @@ xlog_recover_quotaoff_pass1( STATIC int xlog_recover_dquot_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_mount_t *mp = log->l_mp; @@ -2530,14 +2538,11 @@ xlog_recover_dquot_pass2( return XFS_ERROR(EIO); ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2558,7 +2563,7 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); return (0); @@ -2642,7 +2647,8 @@ xlog_recover_efd_pass2( * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; @@ -2712,21 +2718,22 @@ STATIC int xlog_recover_commit_pass2( struct log *log, struct xlog_recover *trans, + struct list_head *buffer_list, xlog_recover_item_t *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, item); + return xlog_recover_buffer_pass2(log, buffer_list, item); case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, item); + return xlog_recover_inode_pass2(log, buffer_list, item); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, item); + return xlog_recover_dquot_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; @@ -2750,8 +2757,9 @@ xlog_recover_commit_trans( struct xlog_recover *trans, int pass) { - int error = 0; + int error = 0, error2; xlog_recover_item_t *item; + LIST_HEAD (buffer_list); hlist_del(&trans->r_list); @@ -2760,16 +2768,27 @@ xlog_recover_commit_trans( return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { - if (pass == XLOG_RECOVER_PASS1) + switch (pass) { + case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); - else - error = xlog_recover_commit_pass2(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + error = xlog_recover_commit_pass2(log, trans, + &buffer_list, item); + break; + default: + ASSERT(0); + } + if (error) - return error; + goto out; } xlog_recover_free_trans(trans); - return 0; + +out: + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; } STATIC int @@ -3079,7 +3098,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0); if (error) goto fail_iput; @@ -3639,11 +3658,8 @@ xlog_do_recover( * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); - if (error) { + if (error) return error; - } - - xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. @@ -3670,7 +3686,6 @@ xlog_do_recover( bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); xfsbdstrat(log->l_mp, bp); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bd672def95a..331cd9f83a7 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 1ffead4b229..536021fb3d4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -22,6 +22,7 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" @@ -37,7 +38,6 @@ #include "xfs_rtalloc.h" #include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_utils.h" @@ -683,8 +683,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); reread: - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, - XFS_SB_DADDR, sector_size, 0); + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -1032,9 +1032,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "filesystem size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1047,9 +1047,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "log size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, + bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; @@ -1288,7 +1288,7 @@ xfs_mountfs( XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); - goto out_free_perag; + goto out_fail_wait; } /* @@ -1315,7 +1315,7 @@ xfs_mountfs( !mp->m_sb.sb_inprogress) { error = xfs_initialize_perag_data(mp, sbp->sb_agcount); if (error) - goto out_free_perag; + goto out_fail_wait; } /* @@ -1439,6 +1439,10 @@ xfs_mountfs( IRELE(rip); out_log_dealloc: xfs_log_unmount(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); out_remove_uuid: @@ -1475,15 +1479,15 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* - * Do a delwri reclaim pass first so that as many dirty inodes are - * queued up for IO as possible. Then flush the buffers before making - * a synchronous path to catch all the remaining inodes are reclaimed. - * This makes the reclaim process as quick as possible by avoiding - * synchronous writeout and blocking on inodes already in the delwri - * state as much as possible. + * Flush all pending changes from the AIL. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * And reclaim all inodes. At this point there should be no dirty + * inode, and none should be pinned or locked, but use synchronous + * reclaim just to be sure. */ - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1519,15 +1523,12 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - xfs_unmountfs_writesb(mp); /* - * Make sure all buffers have been flushed and completed before - * unmounting the log. + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. */ - error = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (error) - xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_ail_push_all_sync(mp->m_ail); xfs_wait_buftarg(mp->m_ddev_targp); xfs_log_unmount_write(mp); @@ -1588,36 +1589,6 @@ xfs_log_sbcount(xfs_mount_t *mp) return error; } -int -xfs_unmountfs_writesb(xfs_mount_t *mp) -{ - xfs_buf_t *sbp; - int error = 0; - - /* - * skip superblock write if fs is read-only, or - * if we are doing a forced umount. - */ - if (!((mp->m_flags & XFS_MOUNT_RDONLY) || - XFS_FORCED_SHUTDOWN(mp))) { - - sbp = xfs_getsb(mp, 0); - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - xfs_buf_delwri_dequeue(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - ASSERT(sbp->b_target == mp->m_ddev_targp); - xfsbdstrat(mp, sbp); - error = xfs_buf_iowait(sbp); - if (error) - xfs_buf_ioerror_alert(sbp, __func__); - xfs_buf_relse(sbp); - } - return error; -} - /* * xfs_mod_sb() can be used to copy arbitrary changes to the * in-core superblock into the superblock buffer to be logged. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9eba7388782..8b89c5ac72d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -214,6 +214,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; } xfs_mount_t; /* @@ -378,7 +379,6 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 55c6afedc87..249db198776 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -65,7 +64,8 @@ STATIC int xfs_qm_dquot_walk( struct xfs_mount *mp, int type, - int (*execute)(struct xfs_dquot *dqp)) + int (*execute)(struct xfs_dquot *dqp, void *data), + void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); @@ -97,7 +97,7 @@ restart: next_index = be32_to_cpu(dqp->q_core.d_id) + 1; - error = execute(batch[i]); + error = execute(batch[i], data); if (error == EAGAIN) { skipped++; continue; @@ -129,7 +129,8 @@ restart: */ STATIC int xfs_qm_dqpurge( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp, + void *data) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -153,21 +154,7 @@ xfs_qm_dqpurge( dqp->dq_flags |= XFS_DQ_FREEING; - /* - * If we're turning off quotas, we have to make sure that, for - * example, we don't delete quota disk blocks while dquots are - * in the process of getting written to those disk blocks. - * This dquot might well be on AIL, and we can't leave it there - * if we're turning off quotas. Basically, we need this flush - * lock, and are willing to block on it. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* - * Block on the flush lock after nudging dquot buffer, - * if it is incore. - */ - xfs_dqflock_pushbuf_wait(dqp); - } + xfs_dqflock(dqp); /* * If we are turning this type of quotas off, we don't care @@ -175,16 +162,21 @@ xfs_qm_dqpurge( * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { - int error; + struct xfs_buf *bp = NULL; + int error; /* * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, SYNC_WAIT); - if (error) + error = xfs_qm_dqflush(dqp, &bp); + if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); + } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } xfs_dqflock(dqp); } @@ -226,11 +218,11 @@ xfs_qm_dqpurge_all( uint flags) { if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -483,6 +475,23 @@ done: xfs_dqunlock(udq); } +static bool +xfs_qm_need_dqattach( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return false; + if (!XFS_IS_QUOTA_ON(mp)) + return false; + if (!XFS_NOT_DQATTACHED(mp, ip)) + return false; + if (ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return false; + return true; +} /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON @@ -500,11 +509,7 @@ xfs_qm_dqattach_locked( uint nquotas = 0; int error = 0; - if (!XFS_IS_QUOTA_RUNNING(mp) || - !XFS_IS_QUOTA_ON(mp) || - !XFS_NOT_DQATTACHED(mp, ip) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (!xfs_qm_need_dqattach(ip)) return 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -575,6 +580,9 @@ xfs_qm_dqattach( { int error; + if (!xfs_qm_need_dqattach(ip)) + return 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -855,15 +863,16 @@ xfs_qm_reset_dqcounts( STATIC int xfs_qm_dqiter_bufs( - xfs_mount_t *mp, - xfs_dqid_t firstid, - xfs_fsblock_t bno, - xfs_filblks_t blkcnt, - uint flags) + struct xfs_mount *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags, + struct list_head *buffer_list) { - xfs_buf_t *bp; - int error; - int type; + struct xfs_buf *bp; + int error; + int type; ASSERT(blkcnt > 0); type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : @@ -887,7 +896,7 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); /* * goto the next block. @@ -895,6 +904,7 @@ xfs_qm_dqiter_bufs( bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } + return error; } @@ -904,11 +914,12 @@ xfs_qm_dqiter_bufs( */ STATIC int xfs_qm_dqiterate( - xfs_mount_t *mp, - xfs_inode_t *qip, - uint flags) + struct xfs_mount *mp, + struct xfs_inode *qip, + uint flags, + struct list_head *buffer_list) { - xfs_bmbt_irec_t *map; + struct xfs_bmbt_irec *map; int i, nmaps; /* number of map entries */ int error; /* return value */ xfs_fileoff_t lblkno; @@ -975,21 +986,17 @@ xfs_qm_dqiterate( * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ - if ((error = xfs_qm_dqiter_bufs(mp, - firstid, - map[i].br_startblock, - map[i].br_blockcount, - flags))) { - break; - } + error = xfs_qm_dqiter_bufs(mp, firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags, buffer_list); + if (error) + goto out; } - - if (error) - break; } while (nmaps > 0); +out: kmem_free(map); - return error; } @@ -1182,8 +1189,11 @@ error0: STATIC int xfs_qm_flush_one( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp, + void *data) { + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; int error = 0; xfs_dqlock(dqp); @@ -1192,11 +1202,13 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - if (!xfs_dqflock_nowait(dqp)) - xfs_dqflock_pushbuf_wait(dqp); - - error = xfs_qm_dqflush(dqp, 0); + xfs_dqflock(dqp); + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); return error; @@ -1215,6 +1227,7 @@ xfs_qm_quotacheck( size_t structsz; xfs_inode_t *uip, *gip; uint flags; + LIST_HEAD (buffer_list); count = INT_MAX; structsz = 1; @@ -1233,7 +1246,8 @@ xfs_qm_quotacheck( */ uip = mp->m_quotainfo->qi_uquotaip; if (uip) { - error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; @@ -1242,7 +1256,8 @@ xfs_qm_quotacheck( gip = mp->m_quotainfo->qi_gquotaip; if (gip) { error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; @@ -1265,19 +1280,27 @@ xfs_qm_quotacheck( * We've made all the changes that we need to make incore. Flush them * down to disk buffers if everything was updated successfully. */ - if (XFS_IS_UQUOTA_ON(mp)) - error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one); + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + &buffer_list); + } if (XFS_IS_GQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one); + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one); + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + &buffer_list); if (!error) error = error2; } + error2 = xfs_buf_delwri_submit(&buffer_list); + if (!error) + error = error2; + /* * We can get this error if we couldn't do a dquot allocation inside * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the @@ -1291,15 +1314,6 @@ xfs_qm_quotacheck( } /* - * We didn't log anything, because if we crashed, we'll have to - * start the quotacheck from scratch anyway. However, we must make - * sure that our dquot changes are secure before we put the - * quotacheck'd stamp on the superblock. So, here we do a synchronous - * flush. - */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. @@ -1308,6 +1322,13 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: + while (!list_empty(&buffer_list)) { + struct xfs_buf *bp = + list_first_entry(&buffer_list, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } + if (error) { xfs_warn(mp, "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", @@ -1424,6 +1445,7 @@ xfs_qm_dqfree_one( STATIC void xfs_qm_dqreclaim_one( struct xfs_dquot *dqp, + struct list_head *buffer_list, struct list_head *dispose_list) { struct xfs_mount *mp = dqp->q_mount; @@ -1456,25 +1478,20 @@ xfs_qm_dqreclaim_one( if (!xfs_dqflock_nowait(dqp)) goto out_busy; - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + trace_xfs_dqreclaim_dirty(dqp); - /* - * We flush it delayed write, so don't bother releasing the - * freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, &bp); if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); + goto out_busy; } + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); /* * Give the dquot another try on the freelist, as the * flushing will take some time. @@ -1518,8 +1535,10 @@ xfs_qm_shake( struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); int nr_to_scan = sc->nr_to_scan; + LIST_HEAD (buffer_list); LIST_HEAD (dispose_list); struct xfs_dquot *dqp; + int error; if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; @@ -1532,15 +1551,20 @@ xfs_qm_shake( break; dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, q_lru); - xfs_qm_dqreclaim_one(dqp, &dispose_list); + xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); } mutex_unlock(&qi->qi_lru_lock); + error = xfs_buf_delwri_submit(&buffer_list); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + while (!list_empty(&dispose_list)) { dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); list_del_init(&dqp->q_lru); xfs_qm_dqfree_one(dqp); } + out: return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index e6986b5d80d..6b39115bf14 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c4f396e437a..858a3b18611 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -22,7 +22,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7e76f537abb..fed504fc299 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index e44ef7ee8ce..30ff5f401d2 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca4f31534a0..92d4331cd4f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +33,6 @@ #include "xfs_rtalloc.h" #include "xfs_fsops.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_inode_item.h" #include "xfs_trans_space.h" #include "xfs_utils.h" @@ -1872,9 +1870,9 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2219,9 +2217,9 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c deleted file mode 100644 index 597d044a09a..00000000000 --- a/fs/xfs/xfs_rw.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_error.h" -#include "xfs_rw.h" - -/* - * Force a shutdown of the filesystem instantly while keeping - * the filesystem consistent. We don't do an unmount here; just shutdown - * the shop, make sure that absolutely nothing persistent happens to - * this filesystem after this point. - */ -void -xfs_do_force_shutdown( - xfs_mount_t *mp, - int flags, - char *fname, - int lnnum) -{ - int logerror; - - logerror = flags & SHUTDOWN_LOG_IO_ERROR; - - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = 0x%p", - __func__, flags, lnnum, fname, __return_address); - } - /* - * No need to duplicate efforts. - */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, - "Corruption of in-memory data detected. Shutting down filesystem"); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) - xfs_stack_trace(); - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_alert(mp, - "Please umount the filesystem and rectify the problem(s)"); - } -} - -/* - * This isn't an absolute requirement, but it is - * just a good idea to call xfs_read_buf instead of - * directly doing a read_buf call. For one, we shouldn't - * be doing this disk read if we are in SHUTDOWN state anyway, - * so this stops that from happening. Secondly, this does all - * the error checking stuff and the brelse if appropriate for - * the caller, so the code can be a little leaner. - */ - -int -xfs_read_buf( - struct xfs_mount *mp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) -{ - xfs_buf_t *bp; - int error; - - if (!flags) - flags = XBF_LOCK | XBF_MAPPED; - - bp = xfs_buf_read(target, blkno, len, flags); - if (!bp) - return XFS_ERROR(EIO); - error = bp->b_error; - if (!error && !XFS_FORCED_SHUTDOWN(mp)) { - *bpp = bp; - } else { - *bpp = NULL; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - } else { - error = XFS_ERROR(EIO); - } - if (bp) { - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - /* - * brelse clears B_ERROR and b_error - */ - xfs_buf_relse(bp); - } - } - return (error); -} - -/* - * helper function to extract extent size hint from inode - */ -xfs_extlen_t -xfs_get_extsz_hint( - struct xfs_inode *ip) -{ - if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) - return ip->i_d.di_extsize; - if (XFS_IS_REALTIME_INODE(ip)) - return ip->i_mount->m_sb.sb_rextsize; - return 0; -} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h deleted file mode 100644 index bbdb9ad6a4b..00000000000 --- a/fs/xfs/xfs_rw.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_RW_H__ -#define __XFS_RW_H__ - -struct xfs_buf; -struct xfs_inode; -struct xfs_mount; - -/* - * Convert the given file system block to a disk block. - * We have to treat it differently based on whether the - * file is a real time file or not, because the bmap code - * does. - */ -static inline xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} - -/* - * Prototypes for functions in xfs_rw.c. - */ -extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, - xfs_daddr_t blkno, int len, uint flags, - struct xfs_buf **bpp); -extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); - -#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index dab9a5f6dfd..0d9de41a715 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -17,7 +17,6 @@ */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -622,7 +621,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); } STATIC void @@ -773,8 +772,14 @@ xfs_init_mount_workqueues( if (!mp->m_unwritten_workqueue) goto out_destroy_data_iodone_queue; + mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_cil_workqueue) + goto out_destroy_unwritten; return 0; +out_destroy_unwritten: + destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: destroy_workqueue(mp->m_data_workqueue); out: @@ -785,6 +790,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); } @@ -926,7 +932,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); @@ -981,18 +987,9 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - xfs_syncd_stop(mp); - - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1072,7 +1069,7 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); - if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); @@ -1362,31 +1359,32 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_mountfs(mp); + error = xfs_syncd_init(mp); if (error) goto out_filestream_unmount; - error = xfs_syncd_init(mp); + error = xfs_mountfs(mp); if (error) - goto out_unmount; + goto out_syncd_stop; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; - goto out_syncd_stop; + goto out_unmount; } if (is_bad_inode(root)) { error = EINVAL; - goto out_syncd_stop; + goto out_unmount; } sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; - goto out_syncd_stop; + goto out_unmount; } return 0; - + out_syncd_stop: + xfs_syncd_stop(mp); out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: @@ -1403,19 +1401,10 @@ out_destroy_workqueues: out: return -error; - out_syncd_stop: - xfs_syncd_stop(mp); out_unmount: - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 205ebcb34d9..c9d3409c5ca 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -241,45 +240,6 @@ xfs_sync_inode_data( return error; } -STATIC int -xfs_sync_inode_attr( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - int error = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_inode_clean(ip)) - goto out_unlock; - if (!xfs_iflock_nowait(ip)) { - if (!(flags & SYNC_WAIT)) - goto out_unlock; - xfs_iflock(ip); - } - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - goto out_unlock; - } - - error = xfs_iflush(ip, flags); - - /* - * We don't want to try again on non-blocking flushes that can't run - * again immediately. If an inode really must be written, then that's - * what the SYNC_WAIT flag is for. - */ - if (error == EAGAIN) { - ASSERT(!(flags & SYNC_WAIT)); - error = 0; - } - - out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - /* * Write out pagecache data for the whole filesystem. */ @@ -300,19 +260,6 @@ xfs_sync_data( return 0; } -/* - * Write out inode metadata (attributes) for the whole filesystem. - */ -STATIC int -xfs_sync_attr( - struct xfs_mount *mp, - int flags) -{ - ASSERT((flags & ~SYNC_WAIT) == 0); - - return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -350,7 +297,7 @@ xfs_sync_fsdata( * First stage of freeze - no writers will make progress now we are here, * so we flush delwri and delalloc buffers here, then wait for all I/O to * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother flushing the buftarg + * transactions can still occur here so don't bother emptying the AIL * because it'll just get dirty again. */ int @@ -365,47 +312,13 @@ xfs_quiesce_data( /* write superblock and hoover up shutdown errors */ error = xfs_sync_fsdata(mp); - /* make sure all delwri buffers are written out */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) error2 = xfs_fs_log_dummy(mp); - /* flush data-only devices */ - if (mp->m_rtdev_targp) - xfs_flush_buftarg(mp->m_rtdev_targp, 1); - return error ? error : error2; } -STATIC void -xfs_quiesce_fs( - struct xfs_mount *mp) -{ - int count = 0, pincount; - - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 0); - - /* - * This loop must run at least twice. The first instance of the loop - * will flush most meta data but that will generate more meta data - * (typically directory updates). Which then must be flushed and - * logged before we can write the unmount record. We also so sync - * reclaim of inodes to catch any that the above delwri flush skipped. - */ - do { - xfs_reclaim_inodes(mp, SYNC_WAIT); - xfs_sync_attr(mp, SYNC_WAIT); - pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (!pincount) { - delay(50); - count++; - } - } while (count < 2); -} - /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to @@ -421,8 +334,12 @@ xfs_quiesce_attr( while (atomic_read(&mp->m_active_trans) > 0) delay(100); - /* flush inodes and push all remaining buffers out to disk */ - xfs_quiesce_fs(mp); + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* flush all pending changes from the AIL */ + xfs_ail_push_all_sync(mp->m_ail); /* * Just warn here till VFS can correctly support @@ -436,7 +353,12 @@ xfs_quiesce_attr( xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); + + /* + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. + */ + xfs_ail_push_all_sync(mp->m_ail); } static void @@ -460,16 +382,27 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); + /* + * We shouldn't write/force the log if we are in the mount/unmount + * process or on a read only filesystem. The workqueue still needs to be + * active in both cases, however, because it is used for inode reclaim + * during these times. Use the s_umount semaphore to provide exclusion + * with unmount. + */ + if (down_read_trylock(&mp->m_super->s_umount)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* dgc: errors ignored here */ + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently + * dirty */ + xfs_ail_push_all(mp->m_ail); + } + up_read(&mp->m_super->s_umount); } /* queue us up again */ @@ -488,14 +421,6 @@ xfs_syncd_queue_reclaim( struct xfs_mount *mp) { - /* - * We can have inodes enter reclaim after we've shut down the syncd - * workqueue during unmount, so don't allow reclaim work to be queued - * during unmount. - */ - if (!(mp->m_super->s_flags & MS_ACTIVE)) - return; - rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, @@ -564,7 +489,6 @@ xfs_syncd_init( INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); xfs_syncd_queue_sync(mp); - xfs_syncd_queue_reclaim(mp); return 0; } @@ -702,11 +626,8 @@ xfs_reclaim_inode_grab( } /* - * Inodes in different states need to be treated differently, and the return - * value of xfs_iflush is not sufficient to get this right. The following table - * lists the inode states and the reclaim actions necessary for non-blocking - * reclaim: - * + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: * * inode state iflush ret required action * --------------- ---------- --------------- @@ -716,39 +637,31 @@ xfs_reclaim_inode_grab( * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue - * dirty, delwri ok 0 requeue - * dirty, delwri blocked EAGAIN requeue - * dirty, sync flush 0 reclaim + * dirty, async - requeue + * dirty, sync 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * - * As can be seen from the table, the return value of xfs_iflush() is not - * sufficient to correctly decide the reclaim action here. The checks in - * xfs_iflush() might look like duplicates, but they are not. - * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. The clean inode check needs to be done before flushing - * the inode delwri otherwise we would loop forever requeuing clean inodes as - * we cannot tell apart a successful delwri flush and a clean inode from the - * return value of xfs_iflush(). + * the inode is clean. * - * Note that because the inode is flushed delayed write by background - * writeback, the flush lock may already be held here and waiting on it can - * result in very long latencies. Hence for sync reclaims, where we wait on the - * flush lock, the caller should push out delayed write inodes first before - * trying to reclaim them to minimise the amount of time spent waiting. For - * background relaim, we just requeue the inode for the next pass. + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background relaim, we only + * bother to reclaim clean inodes anyway. * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim - * pinned, delwri => requeue + * pinned, async => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim - * dirty, delwri => flush and requeue + * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ STATIC int @@ -757,7 +670,8 @@ xfs_reclaim_inode( struct xfs_perag *pag, int sync_mode) { - int error; + struct xfs_buf *bp = NULL; + int error; restart: error = 0; @@ -765,17 +679,6 @@ restart: if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; - - /* - * If we only have a single dirty inode in a cluster there is - * a fair chance that the AIL push may have pushed it into - * the buffer, but xfsbufd won't touch it until 30 seconds - * from now, and thus we will lock up here. - * - * Promote the inode buffer to the front of the delwri list - * and wake up xfsbufd now. - */ - xfs_promote_inode(ip); xfs_iflock(ip); } @@ -783,13 +686,12 @@ restart: goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); goto reclaim; } if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) { - xfs_ifunlock(ip); - goto out; - } + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) @@ -798,60 +700,42 @@ restart: goto reclaim; /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* * Now we have an inode that needs flushing. * - * We do a nonblocking flush here even if we are doing a SYNC_WAIT - * reclaim as we can deadlock with inode cluster removal. + * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_itobp() to get the cluster buffer will result + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_itobp() to get the cluster buffer would result * in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, - * just unlock the inode, back off and try again. Hopefully the next - * pass through will see the stale flag set on the inode. + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. */ - error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); - if (sync_mode & SYNC_WAIT) { - if (error == EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - xfs_iflock(ip); - goto reclaim; + error = xfs_iflush(ip, &bp); + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; } - /* - * When we have to flush an inode but don't have SYNC_WAIT set, we - * flush the inode out using a delwri buffer and wait for the next - * call into reclaim to find it in a clean state instead of waiting for - * it now. We also don't return errors here - if the error is transient - * then the next reclaim pass will flush the inode, and if the error - * is permanent then the next sync reclaim will reclaim the inode and - * pass on the error. - */ - if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_warn(ip->i_mount, - "inode 0x%llx background reclaim flush failed with %d", - (long long)ip->i_ino, error); + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); } -out: - xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. - */ - return 0; + xfs_iflock(ip); reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -884,8 +768,21 @@ reclaim: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_inode_free(ip); - return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. + */ + return 0; } /* diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9010ce885e6..624bedd8135 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 06838c42b2a..7cf9d3529e5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -281,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(int, nblks) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) @@ -291,18 +291,18 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " "lock %d flags %s caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->nblks, __entry->hold, __entry->pincount, __entry->lockval, @@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); -DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); +DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); @@ -362,7 +362,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -406,7 +406,7 @@ TRACE_EVENT(xfs_buf_ioerror, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; @@ -450,7 +450,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_recur = bip->bli_recur; __entry->bli_refcount = atomic_read(&bip->bli_refcount); __entry->buf_bno = bip->bli_buf->b_bn; - __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_len = BBTOB(bip->bli_buf->b_length); __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); @@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); @@ -876,15 +874,30 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) ) +TRACE_EVENT(xfs_log_force, + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), + TP_ARGS(mp, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn) +) + #define DEFINE_LOG_ITEM_EVENT(name) \ DEFINE_EVENT(xfs_log_item_class, name, \ TP_PROTO(struct xfs_log_item *lip), \ TP_ARGS(lip)) DEFINE_LOG_ITEM_EVENT(xfs_ail_push); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); +DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DECLARE_EVENT_CLASS(xfs_file_class, @@ -1145,7 +1158,7 @@ TRACE_EVENT(xfs_bunmap, ); -DECLARE_EVENT_CLASS(xfs_busy_class, +DECLARE_EVENT_CLASS(xfs_extent_busy_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, agbno, len), @@ -1168,17 +1181,17 @@ DECLARE_EVENT_CLASS(xfs_busy_class, __entry->len) ); #define DEFINE_BUSY_EVENT(name) \ -DEFINE_EVENT(xfs_busy_class, name, \ +DEFINE_EVENT(xfs_extent_busy_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ xfs_agblock_t agbno, xfs_extlen_t len), \ TP_ARGS(mp, agno, agbno, len)) -DEFINE_BUSY_EVENT(xfs_alloc_busy); -DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); -DEFINE_BUSY_EVENT(xfs_alloc_busy_force); -DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); -DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); +DEFINE_BUSY_EVENT(xfs_extent_busy); +DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); +DEFINE_BUSY_EVENT(xfs_extent_busy_force); +DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); +DEFINE_BUSY_EVENT(xfs_extent_busy_clear); -TRACE_EVENT(xfs_alloc_busy_trim, +TRACE_EVENT(xfs_extent_busy_trim, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 103b00c9000..fdf324508c5 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -19,9 +19,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -36,6 +34,7 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_quota.h" #include "xfs_trans_priv.h" @@ -585,7 +584,7 @@ xfs_trans_t * _xfs_trans_alloc( xfs_mount_t *mp, uint type, - uint memflags) + xfs_km_flags_t memflags) { xfs_trans_t *tp; @@ -608,8 +607,8 @@ STATIC void xfs_trans_free( struct xfs_trans *tp) { - xfs_alloc_busy_sort(&tp->t_busy); - xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_extent_busy_sort(&tp->t_busy); + xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f6118703f20..7c37b533aa8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -345,11 +345,9 @@ struct xfs_item_ops { void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); - uint (*iop_trylock)(xfs_log_item_t *); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); - void (*iop_push)(xfs_log_item_t *); - bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); }; @@ -357,20 +355,18 @@ struct xfs_item_ops { #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) #define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) -#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) +#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) #define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) -#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) -#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) #define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) /* - * Return values for the IOP_TRYLOCK() routines. + * Return values for the IOP_PUSH() routines. */ -#define XFS_ITEM_SUCCESS 0 -#define XFS_ITEM_PINNED 1 -#define XFS_ITEM_LOCKED 2 -#define XFS_ITEM_PUSHBUF 3 +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_FLUSHING 3 /* * This is the type of function which can be given to xfs_trans_callback() @@ -447,7 +443,7 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces. */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1dead07f092..9c514483e59 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -20,7 +20,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -79,7 +78,7 @@ xfs_ail_check( * Return a pointer to the first item in the AIL. If the AIL is empty, then * return NULL. */ -static xfs_log_item_t * +xfs_log_item_t * xfs_ail_min( struct xfs_ail *ailp) { @@ -364,30 +363,31 @@ xfsaild_push( xfs_log_item_t *lip; xfs_lsn_t lsn; xfs_lsn_t target; - long tout = 10; + long tout; int stuck = 0; + int flushing = 0; int count = 0; - int push_xfsbufd = 0; /* - * If last time we ran we encountered pinned items, force the log first - * and wait for it before pushing again. + * If we encountered pinned items or did not finish writing out all + * buffers the last time we ran, force the log first and wait for it + * before pushing again. */ - spin_lock(&ailp->xa_lock); - if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && - !list_empty(&ailp->xa_ail)) { + if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->xa_buf_list) || + xfs_ail_min_lsn(ailp))) { ailp->xa_log_flush = 0; - spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); xfs_log_force(mp, XFS_LOG_SYNC); - spin_lock(&ailp->xa_lock); } - target = ailp->xa_target; + spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); - if (!lip || XFS_FORCED_SHUTDOWN(mp)) { + if (!lip) { /* - * AIL is empty or our push has reached the end. + * If the AIL is empty or our push has reached the end we are + * done now. */ xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); @@ -396,54 +396,42 @@ xfsaild_push( XFS_STATS_INC(xs_push_ail); - /* - * While the item we are looking at is below the given threshold - * try to flush it out. We'd like not to stop until we've at least - * tried to push on everything in the AIL with an LSN less than - * the given threshold. - * - * However, we will stop after a certain number of pushes and wait - * for a reduced timeout to fire before pushing further. This - * prevents use from spinning when we can't do anything or there is - * lots of contention on the AIL lists. - */ lsn = lip->li_lsn; + target = ailp->xa_target; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; + /* - * If we can lock the item without sleeping, unlock the AIL - * lock and flush the item. Then re-grab the AIL lock so we - * can look for the next item on the AIL. List changes are - * handled by the AIL lookup functions internally - * - * If we can't lock the item, either its holder will flush it - * or it is already being flushed or it is being relogged. In - * any of these case it is being taken care of and we can just - * skip to the next item in the list. + * Note that IOP_PUSH may unlock and reacquire the AIL lock. We + * rely on the AIL cursor implementation to be able to deal with + * the dropped lock. */ - lock_result = IOP_TRYLOCK(lip); - spin_unlock(&ailp->xa_lock); + lock_result = IOP_PUSH(lip, &ailp->xa_buf_list); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); trace_xfs_ail_push(lip); - IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; - case XFS_ITEM_PUSHBUF: - XFS_STATS_INC(xs_push_ail_pushbuf); - trace_xfs_ail_pushbuf(lip); - - if (!IOP_PUSHBUF(lip)) { - trace_xfs_ail_pushbuf_pinned(lip); - stuck++; - ailp->xa_log_flush++; - } else { - ailp->xa_last_pushed_lsn = lsn; - } - push_xfsbufd = 1; + case XFS_ITEM_FLUSHING: + /* + * The item or its backing buffer is already beeing + * flushed. The typical reason for that is that an + * inode buffer is locked because we already pushed the + * updates to it as part of inode clustering. + * + * We do not want to to stop flushing just because lots + * of items are already beeing flushed, but we need to + * re-try the flushing relatively soon if most of the + * AIL is beeing flushed. + */ + XFS_STATS_INC(xs_push_ail_flushing); + trace_xfs_ail_flushing(lip); + + flushing++; + ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PINNED: @@ -453,28 +441,22 @@ xfsaild_push( stuck++; ailp->xa_log_flush++; break; - case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); trace_xfs_ail_locked(lip); + stuck++; break; - default: ASSERT(0); break; } - spin_lock(&ailp->xa_lock); - /* should we bother continuing? */ - if (XFS_FORCED_SHUTDOWN(mp)) - break; - ASSERT(mp->m_log); - count++; /* * Are there too many items we can't do anything with? + * * If we we are skipping too many items because we can't flush * them or they are already being flushed, we back off and * given them time to complete whatever operation is being @@ -496,42 +478,36 @@ xfsaild_push( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (push_xfsbufd) { - /* we've got delayed write buffers to flush */ - wake_up_process(mp->m_ddev_targp->bt_task); - } + if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) + ailp->xa_log_flush++; - /* assume we have more work to do in a short while */ + if (!count || XFS_LSN_CMP(lsn, target) >= 0) { out_done: - if (!count) { - /* We're past our target or empty, so idle */ - ailp->xa_last_pushed_lsn = 0; - ailp->xa_log_flush = 0; - - tout = 50; - } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* - * We reached the target so wait a bit longer for I/O to - * complete and remove pushed items from the AIL before we - * start the next scan from the start of the AIL. + * We reached the target or the AIL is empty, so wait a bit + * longer for I/O to complete and remove pushed items from the + * AIL before we start the next scan from the start of the AIL. */ tout = 50; ailp->xa_last_pushed_lsn = 0; - } else if ((stuck * 100) / count > 90) { + } else if (((stuck + flushing) * 100) / count > 90) { /* - * Either there is a lot of contention on the AIL or we - * are stuck due to operations in progress. "Stuck" in this - * case is defined as >90% of the items we tried to push - * were stuck. + * Either there is a lot of contention on the AIL or we are + * stuck due to operations in progress. "Stuck" in this case + * is defined as >90% of the items we tried to push were stuck. * * Backoff a bit more to allow some I/O to complete before - * restarting from the start of the AIL. This prevents us - * from spinning on the same items, and if they are pinned will - * all the restart to issue a log force to unpin the stuck - * items. + * restarting from the start of the AIL. This prevents us from + * spinning on the same items, and if they are pinned will all + * the restart to issue a log force to unpin the stuck items. */ tout = 20; ailp->xa_last_pushed_lsn = 0; + } else { + /* + * Assume we have more work to do in a short while. + */ + tout = 10; } return tout; @@ -544,6 +520,8 @@ xfsaild( struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ + current->flags |= PF_MEMALLOC; + while (!kthread_should_stop()) { if (tout && tout <= 20) __set_current_state(TASK_KILLABLE); @@ -611,6 +589,30 @@ xfs_ail_push_all( } /* + * Push out all items in the AIL immediately and wait until the AIL is empty. + */ +void +xfs_ail_push_all_sync( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip; + DEFINE_WAIT(wait); + + spin_lock(&ailp->xa_lock); + while ((lip = xfs_ail_max(ailp)) != NULL) { + prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->xa_target = lip->li_lsn; + wake_up_process(ailp->xa_task); + spin_unlock(&ailp->xa_lock); + schedule(); + spin_lock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + + finish_wait(&ailp->xa_empty, &wait); +} + +/* * xfs_trans_ail_update - bulk AIL insertion operation. * * @xfs_trans_ail_update takes an array of log items that all need to be @@ -667,11 +669,15 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - spin_unlock(&ailp->xa_lock); - if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - xlog_assign_tail_lsn(ailp->xa_mount); + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + spin_unlock(&ailp->xa_lock); + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); } } @@ -700,7 +706,8 @@ void xfs_trans_ail_delete_bulk( struct xfs_ail *ailp, struct xfs_log_item **log_items, - int nr_items) __releases(ailp->xa_lock) + int nr_items, + int shutdown_type) __releases(ailp->xa_lock) { xfs_log_item_t *mlip; int mlip_changed = 0; @@ -718,7 +725,7 @@ xfs_trans_ail_delete_bulk( xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_force_shutdown(mp, shutdown_type); } return; } @@ -729,28 +736,20 @@ xfs_trans_ail_delete_bulk( if (mlip == lip) mlip_changed = 1; } - spin_unlock(&ailp->xa_lock); - if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - xlog_assign_tail_lsn(ailp->xa_mount); + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + spin_unlock(&ailp->xa_lock); + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); } } -/* - * The active item list (AIL) is a doubly linked list of log - * items sorted by ascending lsn. The base of the list is - * a forw/back pointer pair embedded in the xfs mount structure. - * The base is initialized with both pointers pointing to the - * base. This case always needs to be distinguished, because - * the base has no lsn to look at. We almost always insert - * at the end of the list, so on inserts we search from the - * end of the list to find where the new item belongs. - */ - -/* - * Initialize the doubly linked list to point only to itself. - */ int xfs_trans_ail_init( xfs_mount_t *mp) @@ -765,6 +764,8 @@ xfs_trans_ail_init( INIT_LIST_HEAD(&ailp->xa_ail); INIT_LIST_HEAD(&ailp->xa_cursors); spin_lock_init(&ailp->xa_lock); + INIT_LIST_HEAD(&ailp->xa_buf_list); + init_waitqueue_head(&ailp->xa_empty); ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", ailp->xa_mount->m_fsname); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 1302d1d95a5..21c5a5e3700 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -33,7 +31,6 @@ #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_trace.h" /* @@ -56,7 +53,7 @@ xfs_trans_buf_item_match( if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == blkno && - XFS_BUF_COUNT(blip->bli_buf) == len) + BBTOB(blip->bli_buf->b_length) == len) return blip->bli_buf; } @@ -141,15 +138,11 @@ xfs_trans_get_buf(xfs_trans_t *tp, xfs_buf_t *bp; xfs_buf_log_item_t *bip; - if (flags == 0) - flags = XBF_LOCK | XBF_MAPPED; - /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) - return xfs_buf_get(target_dev, blkno, len, - flags | XBF_DONT_BLOCK); + return xfs_buf_get(target_dev, blkno, len, flags); /* * If we find the buffer in the cache with this transaction @@ -165,14 +158,6 @@ xfs_trans_get_buf(xfs_trans_t *tp, XFS_BUF_DONE(bp); } - /* - * If the buffer is stale then it was binval'ed - * since last read. This doesn't matter since the - * caller isn't allowed to use the data anyway. - */ - else if (XFS_BUF_ISSTALE(bp)) - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - ASSERT(bp->b_transp == tp); bip = bp->b_fspriv; ASSERT(bip != NULL); @@ -182,15 +167,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return (bp); } - /* - * We always specify the XBF_DONT_BLOCK flag within a transaction - * so that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. - */ - bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_get(target_dev, blkno, len, flags); if (bp == NULL) { return NULL; } @@ -282,14 +259,13 @@ xfs_trans_read_buf( xfs_buf_log_item_t *bip; int error; - if (flags == 0) - flags = XBF_LOCK | XBF_MAPPED; + *bpp = NULL; /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) { - bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -297,6 +273,8 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; xfs_buf_ioerror_alert(bp, __func__); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); return error; } @@ -371,15 +349,7 @@ xfs_trans_read_buf( return 0; } - /* - * We always specify the XBF_DONT_BLOCK flag within a transaction - * so that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. - */ - bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_read(target, blkno, len, flags); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? @@ -418,19 +388,6 @@ xfs_trans_read_buf( return 0; shutdown_abort: - /* - * the theory here is that buffer is good but we're - * bailing out because the filesystem is being forcibly - * shut down. So we should leave the b_flags alone since - * the buffer's not staled and just get out. - */ -#if defined(DEBUG) - if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) - xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); -#endif - ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) != - (XBF_STALE|XBF_DELWRI)); - trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); *bpp = NULL; @@ -606,7 +563,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); + ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -626,8 +583,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - xfs_buf_delwri_queue(bp); - trace_xfs_trans_log_buf(bip); /* @@ -651,22 +606,33 @@ xfs_trans_log_buf(xfs_trans_t *tp, /* - * This called to invalidate a buffer that is being used within - * a transaction. Typically this is because the blocks in the - * buffer are being freed, so we need to prevent it from being - * written out when we're done. Allowing it to be written again - * might overwrite data in the free blocks if they are reallocated - * to a file. + * Invalidate a buffer that is being used within a transaction. + * + * Typically this is because the blocks in the buffer are being freed, so we + * need to prevent it from being written out when we're done. Allowing it + * to be written again might overwrite data in the free blocks if they are + * reallocated to a file. + * + * We prevent the buffer from being written out by marking it stale. We can't + * get rid of the buf log item at this point because the buffer may still be + * pinned by another transaction. If that is the case, then we'll wait until + * the buffer is committed to disk for the last time (we can tell by the ref + * count) and free it in xfs_buf_item_unpin(). Until that happens we will + * keep the buffer locked so that the buffer and buf log item are not reused. + * + * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log + * the buf item. This will be used at recovery time to determine that copies + * of the buffer in the log before this should not be replayed. * - * We prevent the buffer from being written out by clearing the - * B_DELWRI flag. We can't always - * get rid of the buf log item at this point, though, because - * the buffer may still be pinned by another transaction. If that - * is the case, then we'll wait until the buffer is committed to - * disk for the last time (we can tell by the ref count) and - * free it in xfs_buf_item_unpin(). Until it is cleaned up we - * will keep the buffer locked so that the buffer and buf log item - * are not reused. + * We mark the item descriptor and the transaction dirty so that we'll hold + * the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state about which + * parts of the buffer have been logged. We also clear the flag indicating + * that this is an inode buffer since the data in the buffer will no longer + * be valid. + * + * We set the stale bit in the buffer as well since we're getting rid of it. */ void xfs_trans_binval( @@ -686,7 +652,6 @@ xfs_trans_binval( * If the buffer is already invalidated, then * just return. */ - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); @@ -696,27 +661,8 @@ xfs_trans_binval( return; } - /* - * Clear the dirty bit in the buffer and set the STALE flag - * in the buf log item. The STALE flag will be used in - * xfs_buf_item_unpin() to determine if it should clean up - * when the last reference to the buf item is given up. - * We set the XFS_BLF_CANCEL flag in the buf log format structure - * and log the buf item. This will be used at recovery time - * to determine that copies of the buffer in the log before - * this should not be replayed. - * We mark the item descriptor and the transaction dirty so - * that we'll hold the buffer until after the commit. - * - * Since we're invalidating the buffer, we also clear the state - * about which parts of the buffer have been logged. We also - * clear the flag indicating that this is an inode buffer since - * the data in the buffer will no longer be valid. - * - * We set the stale bit in the buffer as well since we're getting - * rid of it. - */ xfs_buf_stale(bp); + bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 279099717ed..bcb60542fcf 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index f7590f5bade..8d71b16ecca 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 7a7442c03f2..d2eee20d5f5 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 8ab2ced415f..fb62377d1cb 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -71,6 +71,8 @@ struct xfs_ail { spinlock_t xa_lock; xfs_lsn_t xa_last_pushed_lsn; int xa_log_flush; + struct list_head xa_buf_list; + wait_queue_head_t xa_empty; }; /* @@ -90,18 +92,22 @@ xfs_trans_ail_update( } void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items) + struct xfs_log_item **log_items, int nr_items, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_delete( struct xfs_ail *ailp, - xfs_log_item_t *lip) __releases(ailp->xa_lock) + xfs_log_item_t *lip, + int shutdown_type) __releases(ailp->xa_lock) { - xfs_trans_ail_delete_bulk(ailp, &lip, 1); + xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); } void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 65584b55607..398cf681d02 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -57,6 +57,7 @@ typedef __uint64_t __psunsigned_t; #endif /* __KERNEL__ */ typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ typedef __uint32_t xfs_agnumber_t; /* allocation group number */ typedef __int32_t xfs_extnum_t; /* # of extents in a file */ @@ -101,6 +102,7 @@ typedef __uint64_t xfs_fileoff_t; /* block number in a file */ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + /* * Null values for the types. */ @@ -120,6 +122,9 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ #define NULLCOMMITLSN ((xfs_lsn_t)-1) +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + /* * Max values for extlen, extnum, aextnum. */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 79c05ac85bf..4e5b9ad5cb9 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 64981d7e737..b6a82d817a8 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -21,7 +21,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,7 +38,6 @@ #include "xfs_bmap.h" #include "xfs_acl.h" #include "xfs_attr.h" -#include "xfs_rw.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_utils.h" @@ -81,8 +79,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), - XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -1919,7 +1916,7 @@ xfs_alloc_file_space( error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); @@ -1966,7 +1963,7 @@ xfs_zero_remaining_bytes( bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, - mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) return XFS_ERROR(ENOMEM); @@ -2315,17 +2312,33 @@ xfs_change_file_space( case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP: case XFS_IOC_FREESP64: + /* + * These operations actually do IO when extending the file, but + * the allocation is done seperately to the zeroing that is + * done. This set of operations need to be serialised against + * other IO operations, such as truncate and buffered IO. We + * need to take the IOLOCK here to serialise the allocation and + * zeroing IO to prevent other IOLOCK holders (e.g. getbmap, + * truncate, direct IO) from racing against the transient + * allocated but not written state we can have here. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); if (startoffset > fsize) { error = xfs_alloc_file_space(ip, fsize, - startoffset - fsize, 0, attr_flags); - if (error) + startoffset - fsize, 0, + attr_flags | XFS_ATTR_NOLOCK); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); break; + } } iattr.ia_valid = ATTR_SIZE; iattr.ia_size = startoffset; - error = xfs_setattr_size(ip, &iattr, attr_flags); + error = xfs_setattr_size(ip, &iattr, + attr_flags | XFS_ATTR_NOLOCK); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); if (error) return error; |