From d527caf22e48480b102c7c6ee5b9ba12170148f7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli <aarcange@redhat.com> Date: Tue, 22 Mar 2011 16:30:38 -0700 Subject: mm: compaction: prevent kswapd compacting memory to reduce CPU usage This patch reverts 5a03b051 ("thp: use compaction in kswapd for GFP_ATOMIC order > 0") due to reports stating that kswapd CPU usage was higher and IRQs were being disabled more frequently. This was reported at http://www.spinics.net/linux/fedora/alsa-user/msg09885.html. Without this patch applied, CPU usage by kswapd hovers around the 20% mark according to the tester (Arthur Marsh: http://www.spinics.net/linux/fedora/alsa-user/msg09899.html). With this patch applied, it's around 2%. The problem is not related to THP which specifies __GFP_NO_KSWAPD but is triggered by high-order allocations hitting the low watermark for their order and waking kswapd on kernels with CONFIG_COMPACTION set. The most common trigger for this is network cards configured for jumbo frames but it's also possible it'll be triggered by fork-heavy workloads (order-1) and some wireless cards which depend on order-1 allocations. The symptoms for the user will be high CPU usage by kswapd in low-memory situations which could be confused with another writeback problem. While a patch like 5a03b051 may be reintroduced in the future, this patch plays it safe for now and reverts it. [mel@csn.ul.ie: Beefed up the changelog] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Reported-by: Arthur Marsh <arthur.marsh@internode.on.net> Tested-by: Arthur Marsh <arthur.marsh@internode.on.net> Cc: <stable@kernel.org> [2.6.38.1] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/compaction.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 8be430b812d..dcb058bd76c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,8 +42,6 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -397,10 +395,7 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ - if (cc->compact_mode != COMPACT_MODE_KSWAPD) - watermark = low_wmark_pages(zone); - else - watermark = high_wmark_pages(zone); + watermark = low_wmark_pages(zone); watermark += (1 << cc->order); if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) @@ -413,15 +408,6 @@ static int compact_finished(struct zone *zone, if (cc->order == -1) return COMPACT_CONTINUE; - /* - * Generating only one page of the right order is not enough - * for kswapd, we must continue until we're above the high - * watermark as a pool for high order GFP_ATOMIC allocations - * too. - */ - if (cc->compact_mode == COMPACT_MODE_KSWAPD) - return COMPACT_CONTINUE; - /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -543,8 +529,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, - int compact_mode) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -553,7 +538,6 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -599,8 +583,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, - COMPACT_MODE_DIRECT_RECLAIM); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -631,7 +614,6 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, - .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; -- cgit v1.2.3-70-g09d2 From 9d502c1c8d47b337c378c2ac8eaeee7918ad16b1 Mon Sep 17 00:00:00 2001 From: Minchan Kim <minchan.kim@gmail.com> Date: Tue, 22 Mar 2011 16:30:39 -0700 Subject: mm/compaction: check migrate_pages's return value instead of list_empty() Many migrate_page's caller check return value instead of list_empy by cf608ac19c ("mm: compaction: fix COMPACTPAGEFAILED counting"). This patch makes compaction's migrate_pages consistent with others. This patch should not change old behavior. Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/compaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index dcb058bd76c..38ce48805c0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -494,12 +494,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { unsigned long nr_migrate, nr_remaining; + int err; if (!isolate_migratepages(zone, cc)) continue; nr_migrate = cc->nr_migratepages; - migrate_pages(&cc->migratepages, compaction_alloc, + err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, cc->sync); update_nr_listpages(cc); @@ -513,7 +514,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_remaining); /* Release LRU pages not migrated */ - if (!list_empty(&cc->migratepages)) { + if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; } -- cgit v1.2.3-70-g09d2 From 602605a42ea4c299aeed4d806c49fb9dd18cd204 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mel@csn.ul.ie> Date: Tue, 22 Mar 2011 16:33:08 -0700 Subject: mm: compaction: minimise the time IRQs are disabled while isolating free pages compaction_alloc() isolates free pages to be used as migration targets. While its scanning, IRQs are disabled on the mistaken assumption the scanning should be short. Analysis showed that IRQs were in fact being disabled for substantial time. A simple test was run using large anonymous mappings with transparent hugepage support enabled to trigger frequent compactions. A monitor sampled what the worst IRQ-off latencies were and a post-processing tool found the following; Total sampled time IRQs off (not real total time): 22355 Event compaction_alloc..compaction_alloc 8409 us count 1 Event compaction_alloc..compaction_alloc 7341 us count 1 Event compaction_alloc..compaction_alloc 2463 us count 1 Event compaction_alloc..compaction_alloc 2054 us count 1 Event shrink_inactive_list..shrink_zone 1864 us count 1 Event shrink_inactive_list..shrink_zone 88 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __make_request..__blk_run_queue 24 us count 1 Event __alloc_pages_nodemask..__alloc_pages_nodemask 6 us count 1 i.e. compaction is disabled IRQs for a prolonged period of time - 8ms in one instance. The full report generated by the tool can be found at http://www.csn.ul.ie/~mel/postings/minfree-20110225/irqsoff-vanilla-micro.report This patch reduces the time IRQs are disabled by simply disabling IRQs at the last possible minute. An updated IRQs-off summary report then looks like; Total sampled time IRQs off (not real total time): 5493 Event shrink_inactive_list..shrink_zone 1596 us count 1 Event shrink_inactive_list..shrink_zone 1530 us count 1 Event shrink_inactive_list..shrink_zone 956 us count 1 Event shrink_inactive_list..shrink_zone 541 us count 1 Event shrink_inactive_list..shrink_zone 531 us count 1 Event split_huge_page..add_to_swap 232 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __wake_up..__wake_up 1 us count 1 A full report is again available at http://www.csn.ul.ie/~mel/postings/minfree-20110225/irqsoff-minimiseirq-free-v1r4-micro.report As should be obvious, IRQ disabled latencies due to compaction are almost elimimnated for this particular test. [aarcange@redhat.com: Fix initialisation of isolated] Signed-off-by: Mel Gorman <mel@csn.ul.ie> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujisu.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Acked-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Arthur Marsh <arthur.marsh@internode.on.net> Cc: Clemens Ladisch <cladisch@googlemail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/compaction.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 38ce48805c0..b27802e04b9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -153,7 +153,6 @@ static void isolate_freepages(struct zone *zone, * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - spin_lock_irqsave(&zone->lock, flags); for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; @@ -176,9 +175,19 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* Found a block suitable for isolating free pages from */ - isolated = isolate_freepages_block(zone, pfn, freelist); - nr_freepages += isolated; + /* + * Found a block suitable for isolating free pages from. Now + * we disabled interrupts, double check things are ok and + * isolate the pages. This is to minimise the time IRQs + * are disabled + */ + isolated = 0; + spin_lock_irqsave(&zone->lock, flags); + if (suitable_migration_target(page)) { + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + } + spin_unlock_irqrestore(&zone->lock, flags); /* * Record the highest PFN we isolated pages from. When next @@ -188,7 +197,6 @@ static void isolate_freepages(struct zone *zone, if (isolated) high_pfn = max(high_pfn, pfn); } - spin_unlock_irqrestore(&zone->lock, flags); /* split_free_page does not map the pages */ list_for_each_entry(page, freelist, lru) { -- cgit v1.2.3-70-g09d2 From b2eef8c0d09101bbbff2531c097543aedde0b525 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli <aarcange@redhat.com> Date: Tue, 22 Mar 2011 16:33:10 -0700 Subject: mm: compaction: minimise the time IRQs are disabled while isolating pages for migration compaction_alloc() isolates pages for migration in isolate_migratepages. While it's scanning, IRQs are disabled on the mistaken assumption the scanning should be short. Tests show this to be true for the most part but contention times on the LRU lock can be increased. Before this patch, the IRQ disabled times for a simple test looked like Total sampled time IRQs off (not real total time): 5493 Event shrink_inactive_list..shrink_zone 1596 us count 1 Event shrink_inactive_list..shrink_zone 1530 us count 1 Event shrink_inactive_list..shrink_zone 956 us count 1 Event shrink_inactive_list..shrink_zone 541 us count 1 Event shrink_inactive_list..shrink_zone 531 us count 1 Event split_huge_page..add_to_swap 232 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __wake_up..__wake_up 1 us count 1 This patch reduces the worst-case IRQs-disabled latencies by releasing the lock every SWAP_CLUSTER_MAX pages that are scanned and releasing the CPU if necessary. The cost of this is that the processing performing compaction will be slower but IRQs being disabled for too long a time has worse consequences as the following report shows; Total sampled time IRQs off (not real total time): 4367 Event shrink_inactive_list..shrink_zone 881 us count 1 Event shrink_inactive_list..shrink_zone 875 us count 1 Event shrink_inactive_list..shrink_zone 868 us count 1 Event shrink_inactive_list..shrink_zone 555 us count 1 Event split_huge_page..add_to_swap 495 us count 1 Event compact_zone..compact_zone_order 269 us count 1 Event split_huge_page..add_to_swap 266 us count 1 Event shrink_inactive_list..shrink_zone 85 us count 1 Event save_args..call_softirq 36 us count 2 Event __wake_up..__wake_up 1 us count 1 [akpm@linux-foundation.org: simplify with s/unlocked/locked/] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Arthur Marsh <arthur.marsh@internode.on.net> Cc: Clemens Ladisch <cladisch@googlemail.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/compaction.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index b27802e04b9..021a2960ef9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -277,9 +277,27 @@ static unsigned long isolate_migratepages(struct zone *zone, } /* Time to isolate some pages for migration */ + cond_resched(); spin_lock_irq(&zone->lru_lock); for (; low_pfn < end_pfn; low_pfn++) { struct page *page; + bool locked = true; + + /* give a chance to irqs before checking need_resched() */ + if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { + spin_unlock_irq(&zone->lru_lock); + locked = false; + } + if (need_resched() || spin_is_contended(&zone->lru_lock)) { + if (locked) + spin_unlock_irq(&zone->lru_lock); + cond_resched(); + spin_lock_irq(&zone->lru_lock); + if (fatal_signal_pending(current)) + break; + } else if (!locked) + spin_lock_irq(&zone->lru_lock); + if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; -- cgit v1.2.3-70-g09d2