From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 17497d0cd8b..251bed73ac0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -358,7 +358,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page_nosync(page); + lock_page(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); -- cgit v1.2.3-70-g09d2 From d527caf22e48480b102c7c6ee5b9ba12170148f7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Mar 2011 16:30:38 -0700 Subject: mm: compaction: prevent kswapd compacting memory to reduce CPU usage This patch reverts 5a03b051 ("thp: use compaction in kswapd for GFP_ATOMIC order > 0") due to reports stating that kswapd CPU usage was higher and IRQs were being disabled more frequently. This was reported at http://www.spinics.net/linux/fedora/alsa-user/msg09885.html. Without this patch applied, CPU usage by kswapd hovers around the 20% mark according to the tester (Arthur Marsh: http://www.spinics.net/linux/fedora/alsa-user/msg09899.html). With this patch applied, it's around 2%. The problem is not related to THP which specifies __GFP_NO_KSWAPD but is triggered by high-order allocations hitting the low watermark for their order and waking kswapd on kernels with CONFIG_COMPACTION set. The most common trigger for this is network cards configured for jumbo frames but it's also possible it'll be triggered by fork-heavy workloads (order-1) and some wireless cards which depend on order-1 allocations. The symptoms for the user will be high CPU usage by kswapd in low-memory situations which could be confused with another writeback problem. While a patch like 5a03b051 may be reintroduced in the future, this patch plays it safe for now and reverts it. [mel@csn.ul.ie: Beefed up the changelog] Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reported-by: Arthur Marsh Tested-by: Arthur Marsh Cc: [2.6.38.1] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 9 ++------- mm/compaction.c | 24 +++--------------------- mm/vmscan.c | 18 +----------------- 3 files changed, 6 insertions(+), 45 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index dfa2ed4c0d2..cc9f7a42864 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,9 +11,6 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 -#define COMPACT_MODE_DIRECT_RECLAIM 0 -#define COMPACT_MODE_KSWAPD 1 - #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -28,8 +25,7 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist, bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync, - int compact_mode); + gfp_t gfp_mask, bool sync); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -74,8 +70,7 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync, - int compact_mode) + gfp_t gfp_mask, bool sync) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 8be430b812d..dcb058bd76c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,8 +42,6 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -397,10 +395,7 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ - if (cc->compact_mode != COMPACT_MODE_KSWAPD) - watermark = low_wmark_pages(zone); - else - watermark = high_wmark_pages(zone); + watermark = low_wmark_pages(zone); watermark += (1 << cc->order); if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) @@ -413,15 +408,6 @@ static int compact_finished(struct zone *zone, if (cc->order == -1) return COMPACT_CONTINUE; - /* - * Generating only one page of the right order is not enough - * for kswapd, we must continue until we're above the high - * watermark as a pool for high order GFP_ATOMIC allocations - * too. - */ - if (cc->compact_mode == COMPACT_MODE_KSWAPD) - return COMPACT_CONTINUE; - /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -543,8 +529,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, - int compact_mode) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -553,7 +538,6 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -599,8 +583,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, - COMPACT_MODE_DIRECT_RECLAIM); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -631,7 +614,6 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, - .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6771ea70bfe..3b4a41d7248 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2397,7 +2397,6 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { - int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2428,24 +2427,9 @@ loop_again: sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; - compaction = 0; - if (order && - zone_watermark_ok(zone, 0, - high_wmark_pages(zone), - end_zone, 0) && - !zone_watermark_ok(zone, order, - high_wmark_pages(zone), - end_zone, 0)) { - compact_zone_order(zone, - order, - sc.gfp_mask, false, - COMPACT_MODE_KSWAPD); - compaction = 1; - } - if (zone->all_unreclaimable) continue; - if (!compaction && nr_slab == 0 && + if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* -- cgit v1.2.3-70-g09d2 From e64a782fec684c29a8204c51b3cb554dce588592 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:44 -0700 Subject: mm: change __remove_from_page_cache() Now we renamed remove_from_page_cache with delete_from_page_cache. As consistency of __remove_from_swap_cache and remove_from_swap_cache, we change internal page cache handling function name, too. Signed-off-by: Minchan Kim Cc: Christoph Hellwig Acked-by: Hugh Dickins Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- mm/filemap.c | 8 ++++---- mm/memory-failure.c | 2 +- mm/truncate.c | 2 +- mm/vmscan.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ea36ff29a51..29ebba54c23 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -456,7 +456,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); -extern void __remove_from_page_cache(struct page *page); +extern void __delete_from_page_cache(struct page *page); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* diff --git a/mm/filemap.c b/mm/filemap.c index 1cfb8fd84b2..cb476e70cf1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -108,11 +108,11 @@ */ /* - * Remove a page from the page cache and free it. Caller has to make + * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __remove_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; @@ -154,7 +154,7 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -444,7 +444,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->index = offset; spin_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(old); + __delete_from_page_cache(old); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 99ccb447262..e0af336530c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1130,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) /* * Now take care of user space mappings. - * Abort on fail: __remove_from_page_cache() assumes unmapped page. + * Abort on fail: __delete_from_page_cache() assumes unmapped page. */ if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); diff --git a/mm/truncate.c b/mm/truncate.c index 7c617b5f03e..3d2ae1f423d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -388,7 +388,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) clear_page_mlock(page); BUG_ON(page_has_private(page)); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3b4a41d7248..665b090b6c7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) freepage = mapping->a_ops->freepage; - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); -- cgit v1.2.3-70-g09d2 From 8afdcece4911e51cfff2b50a269418914cab8a3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 22 Mar 2011 16:33:04 -0700 Subject: mm: vmscan: kswapd should not free an excessive number of pages when balancing small zones When reclaiming for order-0 pages, kswapd requires that all zones be balanced. Each cycle through balance_pgdat() does background ageing on all zones if necessary and applies equal pressure on the inactive zone unless a lot of pages are free already. A "lot of free pages" is defined as a "balance gap" above the high watermark which is currently 7*high_watermark. Historically this was reasonable as min_free_kbytes was small. However, on systems using huge pages, it is recommended that min_free_kbytes is higher and it is tuned with hugeadm --set-recommended-min_free_kbytes. With the introduction of transparent huge page support, this recommended value is also applied. On X86-64 with 4G of memory, min_free_kbytes becomes 67584 so one would expect around 68M of memory to be free. The Normal zone is approximately 35000 pages so under even normal memory pressure such as copying a large file, it gets exhausted quickly. As it is getting exhausted, kswapd applies pressure equally to all zones, including the DMA32 zone. DMA32 is approximately 700,000 pages with a high watermark of around 23,000 pages. In this situation, kswapd will reclaim around (23000*8 where 8 is the high watermark + balance gap of 7 * high watermark) pages or 718M of pages before the zone is ignored. What the user sees is that free memory far higher than it should be. To avoid an excessive number of pages being reclaimed from the larger zones, explicitely defines the "balance gap" to be either 1% of the zone or the low watermark for the zone, whichever is smaller. While kswapd will check all zones to apply pressure, it'll ignore zones that meets the (high_wmark + balance_gap) watermark. To test this, 80G were copied from a partition and the amount of memory being used was recorded. A comparison of a patch and unpatched kernel can be seen at http://www.csn.ul.ie/~mel/postings/minfree-20110222/memory-usage-hydra.ps and shows that kswapd is not reclaiming as much memory with the patch applied. Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Shaohua Li Cc: "Chen, Tim C" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 9 +++++++++ mm/vmscan.c | 16 +++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index c335055c425..ed6ebe690f4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -155,6 +155,15 @@ enum { #define SWAP_CLUSTER_MAX 32 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX +/* + * Ratio between the present memory in the zone and the "gap" that + * we're allowing kswapd to shrink in addition to the per-zone high + * wmark, even for zones that already have the high wmark satisfied, + * in order to provide better per-zone lru behavior. We are ok to + * spend not more than 1% of the memory for this zone balancing "gap". + */ +#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100 + #define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ #define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 665b090b6c7..060e4c19140 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2399,6 +2399,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2415,11 +2416,20 @@ loop_again: mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); /* - * We put equal pressure on every zone, unless one - * zone has way too many pages free already. + * We put equal pressure on every zone, unless + * one zone has way too many pages free + * already. The "too many pages" is defined + * as the high wmark plus a "gap" where the + * gap is either the low watermark or 1% + * of the zone, whichever is smaller. */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); if (!zone_watermark_ok_safe(zone, order, - 8*high_wmark_pages(zone), end_zone, 0)) + high_wmark_pages(zone) + balance_gap, + end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, -- cgit v1.2.3-70-g09d2