From 9bc08a45fb117c696e4940cfa1208cb1cc7a2f25 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Sep 2010 15:14:38 +1000 Subject: xfs: improve buffer cache hash scalability When doing large parallel file creates on a 16p machines, large amounts of time is being spent in _xfs_buf_find(). A system wide profile with perf top shows this: 1134740.00 19.3% _xfs_buf_find 733142.00 12.5% __ticket_spin_lock The problem is that the hash contains 45,000 buffers, and the hash table width is only 256 buffers. That means we've got around 200 buffers per chain, and searching it is quite expensive. The hash table size needs to increase. Secondly, every time we do a lookup, we promote the buffer we find to the head of the hash chain. This is causing cachelines to be dirtied and causes invalidation of cachelines across all CPUs that may have walked the hash chain recently. hence every walk of the hash chain is effectively a cold cache walk. Remove the promotion to avoid this invalidation. The results are: 1045043.00 21.2% __ticket_spin_lock 326184.00 6.6% _xfs_buf_find A 70% drop in the CPU usage when looking up buffers. Unfortunately that does not result in an increase in performance underthis workload as contention on the inode_lock soaks up most of the reduction in CPU usage. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_buf.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs/xfs/linux-2.6/xfs_buf.c') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ea79072f521..d72cf2bb054 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -440,12 +440,7 @@ _xfs_buf_find( ASSERT(btp == bp->b_target); if (bp->b_file_offset == range_base && bp->b_buffer_length == range_length) { - /* - * If we look at something, bring it to the - * front of the list for next time. - */ atomic_inc(&bp->b_hold); - list_move(&bp->b_hash_list, &hash->bh_list); goto found; } } @@ -1443,8 +1438,7 @@ xfs_alloc_bufhash( { unsigned int i; - btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ - btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { -- cgit v1.2.3-70-g09d2 From 51749e47e191db8e588ad5cebea731caf7b705d7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 8 Sep 2010 09:00:22 +0000 Subject: xfs: log IO completion workqueue is a high priority queue The workqueue implementation in 2.6.36-rcX has changed, resulting in the workqueues no longer having dedicated threads for work processing. This has caused severe livelocks under heavy parallel create workloads because the log IO completions have been getting held up behind metadata IO completions. Hence log commits would stall, memory allocation would stall because pages could not be cleaned, and lock contention on the AIL during inode IO completion processing was being seen to slow everything down even further. By making the log Io completion workqueue a high priority workqueue, they are queued ahead of all data/metadata IO completions and processed before the data/metadata completions. Hence the log never gets stalled, and operations needed to clean memory can continue as quickly as possible. This avoids the livelock conditions and allos the system to keep running under heavy load as per normal. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs/linux-2.6/xfs_buf.c') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d72cf2bb054..286e36e21da 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1932,7 +1932,8 @@ xfs_buf_init(void) if (!xfs_buf_zone) goto out; - xfslogd_workqueue = create_workqueue("xfslogd"); + xfslogd_workqueue = alloc_workqueue("xfslogd", + WQ_RESCUER | WQ_HIGHPRI, 1); if (!xfslogd_workqueue) goto out_free_buf_zone; -- cgit v1.2.3-70-g09d2