From f07c225036358038bf8a64f75351f10cdca2fb22 Mon Sep 17 00:00:00 2001
From: Nathan Scott
Date: Thu, 28 Sep 2006 10:52:15 +1000
Subject: [XFS] Improve xfsbufd delayed write submission patterns, after blktrace analysis.

Under a sequential create+allocate workload, blktrace reported backward
writes being issued by xfsbufd, and frequent inappropriate queue unplugs.
We now insert at the tail when moving from the delwri lists to the temp
lists, which maintains correct ordering, and we avoid unplugging queues
deep in the submit paths when we'd shortly do it at a higher level anyway.
blktrace now reports much healthier write patterns from xfsbufd for this
workload (and likely many others).

SGI-PV: 954310
SGI-Modid: xfs-linux-melb:xfs-kern:26396a

Signed-off-by: Nathan Scott
Signed-off-by: Tim Shimmin
---
 fs/xfs/linux-2.6/xfs_buf.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs/xfs/linux-2.6/xfs_buf.c')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2af528dcfb0..f13503508c4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -1681,6 +1681,7 @@ xfsbufd(
         xfs_buf_t               *bp, *n;
         struct list_head        *dwq = &target->bt_delwrite_queue;
         spinlock_t              *dwlk = &target->bt_delwrite_lock;
+        int                     count;
 
         current->flags |= PF_MEMALLOC;
 
@@ -1696,6 +1697,7 @@ xfsbufd(
                 schedule_timeout_interruptible(
                         xfs_buf_timer_centisecs * msecs_to_jiffies(10));
 
+                count = 0;
                 age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
                 spin_lock(dwlk);
                 list_for_each_entry_safe(bp, n, dwq, b_list) {
@@ -1711,9 +1713,11 @@
                                         break;
                                 }
 
-                                bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
+                                bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
+                                                 _XBF_RUN_QUEUES);
                                 bp->b_flags |= XBF_WRITE;
-                                list_move(&bp->b_list, &tmp);
+                                list_move_tail(&bp->b_list, &tmp);
+                                count++;
                         }
                 }
                 spin_unlock(dwlk);
@@ -1724,12 +1728,12 @@
 
                         list_del_init(&bp->b_list);
                         xfs_buf_iostrategy(bp);
-
-                        blk_run_address_space(target->bt_mapping);
                 }
 
                 if (as_list_len > 0)
                         purge_addresses();
+                if (count)
+                        blk_run_address_space(target->bt_mapping);
 
                 clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
         } while (!kthread_should_stop());
@@ -1767,7 +1771,7 @@
                         continue;
                 }
 
-                list_move(&bp->b_list, &tmp);
+                list_move_tail(&bp->b_list, &tmp);
         }
         spin_unlock(dwlk);
 
@@ -1776,7 +1780,7 @@
          */
         list_for_each_entry_safe(bp, n, &tmp, b_list) {
                 xfs_buf_lock(bp);
-                bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
+                bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|_XBF_RUN_QUEUES);
                 bp->b_flags |= XBF_WRITE;
                 if (wait)
                         bp->b_flags &= ~XBF_ASYNC;
@@ -1786,6 +1790,9 @@
                 xfs_buf_iostrategy(bp);
         }
 
+        if (wait)
+                blk_run_address_space(target->bt_mapping);
+
         /*
          * Remaining list items must be flushed before returning
          */
@@ -1797,9 +1804,6 @@
                 xfs_buf_relse(bp);
         }
 
-        if (wait)
-                blk_run_address_space(target->bt_mapping);
-
         return pincount;
 }
--
cgit v1.2.3-70-g09d2
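The ordering fix above is easy to see outside the kernel. The following is an illustrative userspace sketch only: the node structure, helper names, and block numbers are invented, and the kernel's list_move()/list_move_tail() operate on embedded struct list_head nodes rather than this simplified singly linked list. It shows why head insertion while draining a delwri-style queue produces descending (backward) submission order, and why tail insertion preserves the ascending order the buffers were queued in.

/*
 * Illustrative sketch only -- not the kernel's <linux/list.h>.
 * Drain a delwri-style queue into a temporary list two ways and
 * print the resulting "submission" order.
 */
#include <stdio.h>

struct node {
        int             blockno;        /* stand-in for a buffer's disk address */
        struct node     *next;
};

/* head insertion: the list_move()-like behaviour being replaced */
static struct node *push_head(struct node *list, struct node *n)
{
        n->next = list;
        return n;
}

/* tail insertion: the list_move_tail()-like behaviour */
static struct node *push_tail(struct node *list, struct node *n)
{
        struct node **pp = &list;

        while (*pp)
                pp = &(*pp)->next;
        n->next = NULL;
        *pp = n;
        return list;
}

static void submit_all(const char *how, struct node *list)
{
        printf("%s:", how);
        for (; list != NULL; list = list->next)
                printf(" %d", list->blockno);   /* pretend I/O submission */
        printf("\n");
}

int main(void)
{
        /* buffers queued in ascending disk order */
        struct node ha[4] = { {100}, {104}, {108}, {112} };
        struct node ta[4] = { {100}, {104}, {108}, {112} };
        struct node *head_list = NULL, *tail_list = NULL;
        int i;

        for (i = 0; i < 4; i++) {
                head_list = push_head(head_list, &ha[i]);
                tail_list = push_tail(tail_list, &ta[i]);
        }

        submit_all("head insertion (reversed, backward writes)", head_list);
        submit_all("tail insertion (queueing order preserved)", tail_list);
        return 0;
}

Compiled and run, the first line prints 112 108 104 100 and the second 100 104 108 112, matching the backward-then-fixed write ordering described in the commit message; deferring blk_run_address_space() until after the whole batch is submitted is the analogous fix for the excessive unplugs.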
From 51bdd70681e247184b81c2de61dbc26154511155 Mon Sep 17 00:00:00 2001
From: Nathan Scott
Date: Thu, 28 Sep 2006 11:01:57 +1000
Subject: [XFS] When issuing metadata readahead, submit bio with READA not READ.

SGI-PV: 944409
SGI-Modid: xfs-linux-melb:xfs-kern:26603a

Signed-off-by: Nathan Scott
Signed-off-by: Tim Shimmin
---
 fs/xfs/linux-2.6/xfs_buf.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs/xfs/linux-2.6/xfs_buf.c')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f13503508c4..247adced6e1 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1156,16 +1156,16 @@ _xfs_buf_ioapply(
         total_nr_pages = bp->b_page_count;
         map_i = 0;
 
-        if (bp->b_flags & _XBF_RUN_QUEUES) {
-                bp->b_flags &= ~_XBF_RUN_QUEUES;
-                rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC;
-        } else {
-                rw = (bp->b_flags & XBF_READ) ? READ : WRITE;
-        }
-
         if (bp->b_flags & XBF_ORDERED) {
                 ASSERT(!(bp->b_flags & XBF_READ));
                 rw = WRITE_BARRIER;
+        } else if (bp->b_flags & _XBF_RUN_QUEUES) {
+                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+                bp->b_flags &= ~_XBF_RUN_QUEUES;
+                rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
+        } else {
+                rw = (bp->b_flags & XBF_WRITE) ? WRITE :
+                     (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
         }
 
         /* Special code path for reading a sub page size buffer in --
--
cgit v1.2.3-70-g09d2
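The rw selection in the patched _xfs_buf_ioapply() is a small precedence chain: barrier writes first, then sync I/O, then readahead or plain transfers. Below is a hedged sketch of that chain with flag names and values invented for the example (they are not the XFS or block-layer constants); only the precedence and the "readahead is never sync" assertion mirror the change above.

/* Sketch of the precedence chain; names and values are made up. */
#include <assert.h>
#include <stdio.h>

enum {
        BUF_READ        = 1 << 0,
        BUF_WRITE       = 1 << 1,
        BUF_READ_AHEAD  = 1 << 2,
        BUF_ORDERED     = 1 << 3,
        BUF_RUN_QUEUES  = 1 << 4,
};

static const char *pick_rw(unsigned int flags)
{
        if (flags & BUF_ORDERED) {
                assert(!(flags & BUF_READ));    /* barriers are writes */
                return "WRITE_BARRIER";
        } else if (flags & BUF_RUN_QUEUES) {
                /* sync I/O: readahead should never be marked sync */
                assert(!(flags & BUF_READ_AHEAD));
                return (flags & BUF_WRITE) ? "WRITE_SYNC" : "READ_SYNC";
        }
        return (flags & BUF_WRITE) ? "WRITE" :
               (flags & BUF_READ_AHEAD) ? "READA" : "READ";
}

int main(void)
{
        printf("%s\n", pick_rw(BUF_READ));                      /* READ */
        printf("%s\n", pick_rw(BUF_READ | BUF_READ_AHEAD));     /* READA */
        printf("%s\n", pick_rw(BUF_WRITE | BUF_RUN_QUEUES));    /* WRITE_SYNC */
        printf("%s\n", pick_rw(BUF_WRITE | BUF_ORDERED));       /* WRITE_BARRIER */
        return 0;
}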
From efb8ad7e9431a430a75d44288614cf6047ff4baa Mon Sep 17 00:00:00 2001
From: Nathan Scott
Date: Thu, 28 Sep 2006 11:03:05 +1000
Subject: [XFS] Add a debug flag for allocations which are known to be larger than one page.

SGI-PV: 955302
SGI-Modid: xfs-linux-melb:xfs-kern:26800a

Signed-off-by: Nathan Scott
Signed-off-by: Tim Shimmin
---
 fs/xfs/linux-2.6/kmem.c    | 8 ++++++++
 fs/xfs/linux-2.6/kmem.h    | 3 ++-
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 fs/xfs/quota/xfs_qm.c      | 6 +++---
 fs/xfs/support/ktrace.c    | 2 +-
 fs/xfs/xfs_iget.c          | 4 ++--
 fs/xfs/xfs_log.c           | 2 +-
 7 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs/xfs/linux-2.6/xfs_buf.c')

diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index aba7fcf881a..f77fe5c8fcc 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -34,6 +34,14 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
         gfp_t   lflags = kmem_flags_convert(flags);
         void    *ptr;
 
+#ifdef DEBUG
+        if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
+                printk(KERN_WARNING "Large %s attempt, size=%ld\n",
+                        __FUNCTION__, (long)size);
+                dump_stack();
+        }
+#endif
+
         do {
                 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
                         ptr = kmalloc(size, lflags);
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 0e8293c5a32..6d24274fb3c 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -30,6 +30,7 @@
 #define KM_NOSLEEP      0x0002u
 #define KM_NOFS         0x0004u
 #define KM_MAYFAIL      0x0008u
+#define KM_LARGE        0x0010u
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -41,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
 {
         gfp_t   lflags;
 
-        BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
+        BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE));
 
         if (flags & KM_NOSLEEP) {
                 lflags = GFP_ATOMIC | __GFP_NOWARN;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 247adced6e1..58b6599de61 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -768,7 +768,7 @@ xfs_buf_get_noaddr(
         _xfs_buf_initialize(bp, target, 0, len, 0);
 
  try_again:
-        data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
+        data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL | KM_LARGE);
         if (unlikely(data == NULL))
                 goto fail_free_buf;
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index e23e45535c4..3f86c7c0464 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -112,17 +112,17 @@ xfs_Gqm_init(void)
 {
         xfs_dqhash_t    *udqhash, *gdqhash;
         xfs_qm_t        *xqm;
-        uint            i, hsize, flags = KM_SLEEP | KM_MAYFAIL;
+        uint            i, hsize, flags = KM_SLEEP | KM_MAYFAIL | KM_LARGE;
 
         /*
          * Initialize the dquot hash tables.
          */
         hsize = XFS_QM_HASHSIZE_HIGH;
-        while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) {
+        while (!(udqhash = kmem_zalloc(hsize * sizeof(*udqhash), flags))) {
                 if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW)
                         flags = KM_SLEEP;
         }
-        gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP);
+        gdqhash = kmem_zalloc(hsize * sizeof(*gdqhash), KM_SLEEP | KM_LARGE);
         ndquot = hsize << 8;
 
         xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index addf5a7ea06..5cf2e86caa7 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -75,7 +75,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
                         sleep);
         } else {
                 ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
-                                                    sleep);
+                                                    sleep | KM_LARGE);
         }
 
         if (ktep == NULL) {
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 109000a4bc8..30eebc2fd90 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -50,7 +50,7 @@ void
 xfs_ihash_init(xfs_mount_t *mp)
 {
         __uint64_t      icount;
-        uint            i, flags = KM_SLEEP | KM_MAYFAIL;
+        uint            i, flags = KM_SLEEP | KM_MAYFAIL | KM_LARGE;
 
         if (!mp->m_ihsize) {
                 icount = mp->m_maxicount ? mp->m_maxicount :
@@ -95,7 +95,7 @@ xfs_chash_init(xfs_mount_t *mp)
         mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
         mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
                                                  * sizeof(xfs_chash_t),
-                                                 KM_SLEEP);
+                                                 KM_SLEEP | KM_LARGE);
         for (i = 0; i < mp->m_chsize; i++) {
                 spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
         }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2a46919110f..ac999789a44 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1196,7 +1196,7 @@ xlog_alloc_log(xfs_mount_t *mp,
                 kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
                 iclog = *iclogp;
                 iclog->hic_data = (xlog_in_core_2_t *)
-                        kmem_zalloc(iclogsize, KM_SLEEP);
+                        kmem_zalloc(iclogsize, KM_SLEEP | KM_LARGE);
                 iclog->ic_prev = prev_iclog;
                 prev_iclog = iclog;
 
--
cgit v1.2.3-70-g09d2
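The kmem_alloc() change above only warns; the point is to make unannotated multi-page allocations visible during debugging. Below is a userspace sketch of the same check with invented names (ex_kmem_alloc, EX_KM_LARGE) and an assumed 4096-byte page size rather than the kernel's PAGE_SIZE; it is not the kernel interface, only the shape of the check.

/* Userspace sketch of the DEBUG-only "large allocation" check. */
#include <stdio.h>
#include <stdlib.h>

#define EX_PAGE_SIZE    4096u
#define EX_KM_LARGE     0x0010u         /* caller knows this is > one page */

static void *ex_kmem_alloc(size_t size, unsigned int flags)
{
#ifndef NDEBUG
        /* unannotated allocations over a page get flagged for inspection */
        if (!(flags & EX_KM_LARGE) && size > EX_PAGE_SIZE)
                fprintf(stderr, "warning: %zu-byte allocation not marked large\n",
                        size);
#endif
        return malloc(size);
}

int main(void)
{
        void *a = ex_kmem_alloc(512, 0);                        /* fine, silent */
        void *b = ex_kmem_alloc(64 * 1024, 0);                  /* warns */
        void *c = ex_kmem_alloc(64 * 1024, EX_KM_LARGE);        /* expected, silent */

        free(a);
        free(b);
        free(c);
        return 0;
}

Callers that legitimately need more than a page pass the explicit flag, as the quota, ktrace, iget, and log call sites above now do; anything else that crosses the threshold is reported with a backtrace (dump_stack() in the kernel version) so the caller can be tracked down and converted.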
From 948ecdb4c118293d2f3e267eec642c30c5d3a056 Mon Sep 17 00:00:00 2001
From: Nathan Scott
Date: Thu, 28 Sep 2006 11:03:13 +1000
Subject: [XFS] Be more defensive with page flags (error/private) for metadata buffers.

SGI-PV: 955302
SGI-Modid: xfs-linux-melb:xfs-kern:26801a

Signed-off-by: Nathan Scott
Signed-off-by: Tim Shimmin
---
 fs/xfs/linux-2.6/xfs_buf.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs/xfs/linux-2.6/xfs_buf.c')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 58b6599de61..9bbadafdcb0 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -318,8 +318,12 @@ xfs_buf_free(
                 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
                         free_address(bp->b_addr - bp->b_offset);
 
-                for (i = 0; i < bp->b_page_count; i++)
-                        page_cache_release(bp->b_pages[i]);
+                for (i = 0; i < bp->b_page_count; i++) {
+                        struct page     *page = bp->b_pages[i];
+
+                        ASSERT(!PagePrivate(page));
+                        page_cache_release(page);
+                }
                 _xfs_buf_free_pages(bp);
         } else if (bp->b_flags & _XBF_KMEM_ALLOC) {
                 /*
@@ -400,6 +404,7 @@ _xfs_buf_lookup_pages(
                 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
                 size -= nbytes;
 
+                ASSERT(!PagePrivate(page));
                 if (!PageUptodate(page)) {
                         page_count--;
                         if (blocksize >= PAGE_CACHE_SIZE) {
@@ -1117,10 +1122,10 @@ xfs_buf_bio_end_io(
         do {
                 struct page     *page = bvec->bv_page;
 
+                ASSERT(!PagePrivate(page));
                 if (unlikely(bp->b_error)) {
                         if (bp->b_flags & XBF_READ)
                                 ClearPageUptodate(page);
-                        SetPageError(page);
                 } else if (blocksize >= PAGE_CACHE_SIZE) {
                         SetPageUptodate(page);
                 } else if (!PagePrivate(page) &&
--
cgit v1.2.3-70-g09d2
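The last patch asserts an invariant (metadata buffer pages never carry private state, and the buffer error path no longer sets PageError) rather than quietly working around unexpected flags. A minimal sketch of that defensive pattern follows; the structures and names are invented for the example, and only the shape of the release loop mirrors the xfs_buf_free() change.

/* Sketch: assert the per-page invariant before releasing each page. */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

struct ex_page {
        bool    is_private;     /* stand-in for PagePrivate() */
        void    *mem;
};

static void ex_release_pages(struct ex_page *pages, int count)
{
        int i;

        for (i = 0; i < count; i++) {
                struct ex_page *page = &pages[i];

                /* metadata buffer pages must never carry private state */
                assert(!page->is_private);
                free(page->mem);
                page->mem = NULL;
        }
}

int main(void)
{
        struct ex_page pages[2] = {
                { false, malloc(64) },
                { false, malloc(64) },
        };

        ex_release_pages(pages, 2);
        return 0;
}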