From 408cc4e97a3ccd172d2d676e4b585badf439271b Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 20 Sep 2012 13:16:45 -0500 Subject: xfs: zero allocation_args on the kernel stack Zero the kernel stack space that makes up the xfs_alloc_arg structures. Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs/xfs_alloc.c') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe..0287f3b1b50 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; -- cgit v1.2.3-70-g09d2 From 326c03555b914ff153ba5b40df87fd6e28e7e367 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:58 +1000 Subject: xfs: introduce XFS_BMAPI_STACK_SWITCH Certain allocation paths through xfs_bmapi_write() are in situations where we have limited stack available. These are almost always in the buffered IO writeback path when convertion delayed allocation extents to real extents. The current stack switch occurs for userdata allocations, which means we also do stack switches for preallocation, direct IO and unwritten extent conversion, even those these call chains have never been implicated in a stack overrun. Hence, let's target just the single stack overun offended for stack switches. To do that, introduce a XFS_BMAPI_STACK_SWITCH flag that the caller can pass xfs_bmapi_write() to indicate it should switch stacks if it needs to do allocation. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_alloc.h | 1 + fs/xfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_bmap.h | 5 ++++- fs/xfs/xfs_iomap.c | 4 +++- 5 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_alloc.c') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0287f3b1b50..43f791bcd8b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2447,7 +2447,7 @@ xfs_alloc_vextent( { DECLARE_COMPLETION_ONSTACK(done); - if (!args->userdata) + if (!args->stack_switch) return __xfs_alloc_vextent(args); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca..ef7d4885dc2 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg { struct completion *done; struct work_struct work; int result; + char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e1545ec2f7d..91259554df8 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,6 +2441,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4675,6 +4676,9 @@ xfs_bmapi_allocate( return error; } + if (flags & XFS_BMAPI_STACK_SWITCH) + bma->stack_switch = 1; + error = xfs_bmap_alloc(bma); if (error) return error; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce1..b68c598034c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,7 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 973dff6ad93..7f537663365 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -584,7 +584,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; -- cgit v1.2.3-70-g09d2 From 1f3c785c3adb7d2b109ec7c8f10081d1294b03d3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:59 +1000 Subject: xfs: move allocation stack switch up to xfs_bmapi_allocate Switching stacks are xfs_alloc_vextent can cause deadlocks when we run out of worker threads on the allocation workqueue. This can occur because xfs_bmap_btalloc can make multiple calls to xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can return with the AGF locked in the current allocation transaction. If we then need to make another allocation, and all the allocation worker contexts are exhausted because the are blocked waiting for the AGF lock, holder of the AGF cannot get it's xfs-alloc_vextent work completed to release the AGF. Hence allocation effectively deadlocks. To avoid this, move the stack switch one layer up to xfs_bmapi_allocate() so that all of the allocation attempts in a single switched stack transaction occur in a single worker context. This avoids the problem of an allocation being blocked waiting for a worker thread whilst holding the AGF. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 42 +------------------------------------- fs/xfs/xfs_alloc.h | 4 ---- fs/xfs/xfs_bmap.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_bmap.h | 4 ++++ 4 files changed, 54 insertions(+), 56 deletions(-) (limited to 'fs/xfs/xfs_alloc.c') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 43f791bcd8b..335206a9c69 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2208,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2418,46 +2418,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index ef7d4885dc2..feacb061bab 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; - char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 91259554df8..83d0cf3df93 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,7 +2441,6 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4620,12 +4619,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4658,25 +4656,25 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. */ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } - if (flags & XFS_BMAPI_STACK_SWITCH) + if (bma->flags & XFS_BMAPI_STACK_SWITCH) bma->stack_switch = 1; error = xfs_bmap_alloc(bma); @@ -4713,7 +4711,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4741,6 +4739,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. + */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4926,6 +4963,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4941,7 +4979,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b68c598034c..5f469c3516e 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -136,6 +136,10 @@ typedef struct xfs_bmalloca { char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* -- cgit v1.2.3-70-g09d2