From c999a223c2f0d31c64ef7379814cea1378b2b800 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 22 Mar 2012 05:15:07 +0000 Subject: xfs: introduce an allocation workqueue We currently have significant issues with the amount of stack that allocation in XFS uses, especially in the writeback path. We can easily consume 4k of stack between mapping the page, manipulating the bmap btree and allocating blocks from the free list. Not to mention btree block readahead and other functionality that issues IO in the allocation path. As a result, we can no longer fit allocation in the writeback path in the stack space provided on x86_64. To alleviate this problem, introduce an allocation workqueue and move all allocations to a seperate context. This can be easily added as an interposing layer into xfs_alloc_vextent(), which takes a single argument structure and does not return until the allocation is complete or has failed. To do this, add a work structure and a completion to the allocation args structure. This allows xfs_alloc_vextent to queue the args onto the workqueue and wait for it to be completed by the worker. This can be done completely transparently to the caller. The worker function needs to ensure that it sets and clears the PF_TRANS flag appropriately as it is being run in an active transaction context. Work can also be queued in a memory reclaim context, so a rescuer is needed for the workqueue. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'fs/xfs/xfs_alloc.c') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index ce84ffd0264..31e90335b83 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -35,6 +35,7 @@ #include "xfs_error.h" #include "xfs_trace.h" +struct workqueue_struct *xfs_alloc_wq; #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) @@ -2207,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. */ int /* error */ -xfs_alloc_vextent( +__xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2417,6 +2418,37 @@ error0: return error; } +static void +xfs_alloc_vextent_worker( + struct work_struct *work) +{ + struct xfs_alloc_arg *args = container_of(work, + struct xfs_alloc_arg, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_alloc_vextent(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + + +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + DECLARE_COMPLETION_ONSTACK(done); + + args->done = &done; + INIT_WORK(&args->work, xfs_alloc_vextent_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + /* * Free an extent. * Just break up the extent address and hand off to xfs_free_ag_extent -- cgit v1.2.3-70-g09d2 From a66d636385d621e98a915233250356c394a437de Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 22 Mar 2012 05:15:12 +0000 Subject: xfs: fix fstrim offset calculations xfs_ioc_fstrim() doesn't treat the incoming offset and length correctly. It treats them as a filesystem block address, rather than a disk address. This is wrong because the range passed in is a linear representation, while the filesystem block address notation is a sparse representation. Hence we cannot convert the range direct to filesystem block units and then use that for calculating the range to trim. While this sounds dangerous, the problem is limited to calculating what AGs need to be trimmed. The code that calcuates the actual ranges to trim gets the right result (i.e. only ever discards free space), even though it uses the wrong ranges to limit what is trimmed. Hence this is not a bug that endangers user data. Fix this by treating the range as a disk address range and use the appropriate functions to convert the range into the desired formats for calculations. Further, fix the first free extent lookup (the longest) to actually find the largest free extent. Currently this lookup uses a <= lookup, which results in finding the extent to the left of the largest because we can never get an exact match on the largest extent. This is due to the fact that while we know it's size, we don't know it's location and so the exact match fails and we move one record to the left to get the next largest extent. Instead, use a >= search so that the lookup returns the largest extent regardless of the fact we don't get an exact match on it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_alloc.h | 7 ++++++ fs/xfs/xfs_discard.c | 61 ++++++++++++++++++++++++++++++++-------------------- 3 files changed, 46 insertions(+), 24 deletions(-) (limited to 'fs/xfs/xfs_alloc.c') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 31e90335b83..0f0df2759b0 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -69,7 +69,7 @@ xfs_alloc_lookup_eq( * Lookup the first record greater than or equal to [bno, len] * in the btree given by cur. */ -STATIC int /* error */ +int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index ab5d0fd2f53..3a7e7d8f8de 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -248,6 +248,13 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat); /* success/failure */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + int /* error */ xfs_alloc_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 286a051f12c..1ad3a4b8ca4 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -37,9 +37,9 @@ STATIC int xfs_trim_extents( struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_fsblock_t start, - xfs_fsblock_t end, - xfs_fsblock_t minlen, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, __uint64_t *blocks_trimmed) { struct block_device *bdev = mp->m_ddev_targp->bt_bdev; @@ -67,7 +67,7 @@ xfs_trim_extents( /* * Look up the longest btree in the AGF and start with it. */ - error = xfs_alloc_lookup_le(cur, 0, + error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -77,8 +77,10 @@ xfs_trim_extents( * enough to be worth discarding. */ while (i) { - xfs_agblock_t fbno; - xfs_extlen_t flen; + xfs_agblock_t fbno; + xfs_extlen_t flen; + xfs_daddr_t dbno; + xfs_extlen_t dlen; error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) @@ -86,10 +88,18 @@ xfs_trim_extents( XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + /* + * use daddr format for all range/len calculations as that is + * the format the range/len variables are supplied in by + * userspace. + */ + dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dlen = XFS_FSB_TO_BB(mp, flen); + /* * Too small? Give up. */ - if (flen < minlen) { + if (dlen < minlen) { trace_xfs_discard_toosmall(mp, agno, fbno, flen); goto out_del_cursor; } @@ -99,8 +109,7 @@ xfs_trim_extents( * supposed to discard skip it. Do not bother to trim * down partially overlapping ranges for now. */ - if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || - XFS_AGB_TO_FSB(mp, agno, fbno) > end) { + if (dbno + dlen < start || dbno > end) { trace_xfs_discard_exclude(mp, agno, fbno, flen); goto next_extent; } @@ -115,10 +124,7 @@ xfs_trim_extents( } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = -blkdev_issue_discard(bdev, - XFS_AGB_TO_DADDR(mp, agno, fbno), - XFS_FSB_TO_BB(mp, flen), - GFP_NOFS, 0); + error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -137,6 +143,15 @@ out_put_perag: return error; } +/* + * trim a range of the filesystem. + * + * Note: the parameters passed from userspace are byte ranges into the + * filesystem which does not match to the format we use for filesystem block + * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format + * is a linear address range. Hence we need to use DADDR based conversions and + * comparisons for determining the correct offset and regions to trim. + */ int xfs_ioc_trim( struct xfs_mount *mp, @@ -145,7 +160,7 @@ xfs_ioc_trim( struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; - xfs_fsblock_t start, end, minlen; + xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; __uint64_t blocks_trimmed = 0; int error, last_error = 0; @@ -159,22 +174,22 @@ xfs_ioc_trim( /* * Truncating down the len isn't actually quite correct, but using - * XFS_B_TO_FSB would mean we trivially get overflows for values + * BBTOB would mean we trivially get overflows for values * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default * used by the fstrim application. In the end it really doesn't * matter as trimming blocks is an advisory interface. */ - start = XFS_B_TO_FSBT(mp, range.start); - end = start + XFS_B_TO_FSBT(mp, range.len) - 1; - minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); + start = BTOBB(range.start); + end = start + BTOBBT(range.len) - 1; + minlen = BTOBB(max_t(u64, granularity, range.minlen)); - if (start >= mp->m_sb.sb_dblocks) + if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks) return -XFS_ERROR(EINVAL); - if (end > mp->m_sb.sb_dblocks - 1) - end = mp->m_sb.sb_dblocks - 1; + if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; - start_agno = XFS_FSB_TO_AGNO(mp, start); - end_agno = XFS_FSB_TO_AGNO(mp, end); + start_agno = xfs_daddr_to_agno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); for (agno = start_agno; agno <= end_agno; agno++) { error = -xfs_trim_extents(mp, agno, start, end, minlen, -- cgit v1.2.3-70-g09d2