From 315601809d124d046abd6c3ffa346d0dbd7aa29d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:52 -0700 Subject: mm: deactivate invalidated pages Recently, there are reported problem about thrashing. (http://marc.info/?l=rsync&m=128885034930933&w=2) It happens by backup workloads(ex, nightly rsync). That's because the workload makes just use-once pages and touches pages twice. It promotes the page into active list so that it results in working set page eviction. Some app developer want to support POSIX_FADV_NOREUSE. But other OSes don't support it, either. (http://marc.info/?l=linux-mm&m=128928979512086&w=2) By other approach, app developers use POSIX_FADV_DONTNEED. But it has a problem. If kernel meets page is writing during invalidate_mapping_pages, it can't work. It makes for application programmer to use it since they always have to sync data before calling fadivse(..POSIX_FADV_DONTNEED) to make sure the pages could be discardable. At last, they can't use deferred write of kernel so that they could see performance loss. (http://insights.oetiker.ch/linux/fadvise.html) In fact, invalidation is very big hint to reclaimer. It means we don't use the page any more. So let's move the writing page into inactive list's head if we can't truncate it right now. Why I move page to head of lru on this patch, Dirty/Writeback page would be flushed sooner or later. It can prevent writeout of pageout which is less effective than flusher's writeout. Originally, I reused lru_demote of Peter with some change so added his Signed-off-by. Signed-off-by: Minchan Kim Reported-by: Ben Gamari Signed-off-by: Peter Zijlstra Acked-by: Rik van Riel Acked-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Acked-by: Johannes Weiner Cc: Nick Piggin Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'mm/swap.c') diff --git a/mm/swap.c b/mm/swap.c index c02f93611a8..4aea806d0d4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -39,6 +39,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -346,6 +347,60 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } +/* + * If the page can not be invalidated, it is moved to the + * inactive list to speed up its reclaim. It is moved to the + * head of the list, rather than the tail, to give the flusher + * threads some time to write it out, as this is much more + * effective than the single-page writeout from reclaim. + */ +static void lru_deactivate(struct page *page, struct zone *zone) +{ + int lru, file; + + if (!PageLRU(page) || !PageActive(page)) + return; + + /* Some processes are using the page */ + if (page_mapped(page)) + return; + + file = page_is_file_cache(page); + lru = page_lru_base_type(page); + del_page_from_lru_list(zone, page, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(zone, page, lru); + __count_vm_event(PGDEACTIVATE); + + update_page_reclaim_stat(zone, page, file, 0); +} + +static void ____pagevec_lru_deactivate(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + lru_deactivate(page, zone); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + + /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -372,6 +427,29 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + ____pagevec_lru_deactivate(pvec); +} + +/** + * deactivate_page - forcefully deactivate a page + * @page: page to deactivate + * + * This function hints the VM that @page is a good reclaim candidate, + * for example if its invalidation fails due to the page being dirty + * or under writeback. + */ +void deactivate_page(struct page *page) +{ + if (likely(get_page_unless_zero(page))) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + if (!pagevec_add(pvec, page)) + ____pagevec_lru_deactivate(pvec); + put_cpu_var(lru_deactivate_pvecs); + } } void lru_add_drain(void) -- cgit v1.2.3-70-g09d2 From 3f58a82943337fb6e79acfa5346719a97d3c0b98 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:53 -0700 Subject: memcg: move memcg reclaimable page into tail of inactive list The rotate_reclaimable_page function moves just written out pages, which the VM wanted to reclaim, to the end of the inactive list. That way the VM will find those pages first next time it needs to free memory. This patch applies the rule in memcg. It can help to prevent unnecessary working page eviction of memcg. Signed-off-by: Minchan Kim Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Cc: KOSAKI Motohiro Acked-by: Johannes Weiner Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 26 ++++++++++++++++++++++++++ mm/swap.c | 3 ++- 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'mm/swap.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a1a1e5384f6..5bb7be2628c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -62,6 +62,7 @@ extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); +extern void mem_cgroup_rotate_reclaimable_page(struct page *page); extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); extern void mem_cgroup_del_lru(struct page *page); extern void mem_cgroup_move_lists(struct page *page, @@ -211,6 +212,11 @@ static inline void mem_cgroup_del_lru_list(struct page *page, int lru) return ; } +static inline inline void mem_cgroup_rotate_reclaimable_page(struct page *page) +{ + return ; +} + static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) { return ; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ef5c53dffc..9e0f05efd11 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -829,6 +829,32 @@ void mem_cgroup_del_lru(struct page *page) mem_cgroup_del_lru_list(page, page_lru(page)); } +/* + * Writeback is about to end against a page which has been marked for immediate + * reclaim. If it still appears to be reclaimable, move it to the tail of the + * inactive list. + */ +void mem_cgroup_rotate_reclaimable_page(struct page *page) +{ + struct mem_cgroup_per_zone *mz; + struct page_cgroup *pc; + enum lru_list lru = page_lru(page); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(page); + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + mz = page_cgroup_zoneinfo(pc); + list_move_tail(&pc->lru, &mz->lists[lru]); +} + void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; diff --git a/mm/swap.c b/mm/swap.c index 4aea806d0d4..1b9e4ebaffc 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -200,8 +200,9 @@ static void pagevec_move_tail(struct pagevec *pvec) spin_lock(&zone->lru_lock); } if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); + enum lru_list lru = page_lru_base_type(page); list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); pgmoved++; } } -- cgit v1.2.3-70-g09d2 From 278df9f451dc71dcd002246be48358a473504ad0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:54 -0700 Subject: mm: reclaim invalidated page ASAP invalidate_mapping_pages is very big hint to reclaimer. It means user doesn't want to use the page any more. So in order to prevent working set page eviction, this patch move the page into tail of inactive list by PG_reclaim. Please, remember that pages in inactive list are working set as well as active list. If we don't move pages into inactive list's tail, pages near by tail of inactive list can be evicted although we have a big clue about useless pages. It's totally bad. Now PG_readahead/PG_reclaim is shared. fe3cba17 added ClearPageReclaim into clear_page_dirty_for_io for preventing fast reclaiming readahead marker page. In this series, PG_reclaim is used by invalidated page, too. If VM find the page is invalidated and it's dirty, it sets PG_reclaim to reclaim asap. Then, when the dirty page will be writeback, clear_page_dirty_for_io will clear PG_reclaim unconditionally. It disturbs this serie's goal. I think it's okay to clear PG_readahead when the page is dirty, not writeback time. So this patch moves ClearPageReadahead. In v4, ClearPageReadahead in set_page_dirty has a problem which is reported by Steven Barrett. It's due to compound page. Some driver(ex, audio) calls set_page_dirty with compound page which isn't on LRU. but my patch does ClearPageRelcaim on compound page. In non-CONFIG_PAGEFLAGS_EXTENDED, it breaks PageTail flag. I think it doesn't affect THP and pass my test with THP enabling but Cced Andrea for double check. Signed-off-by: Minchan Kim Reported-by: Steven Barrett Reviewed-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: Wu Fengguang Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 12 +++++++++++- mm/swap.c | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 4 deletions(-) (limited to 'mm/swap.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2cb01f6ec5d..b437fe6257b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1211,6 +1211,17 @@ int set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; + /* + * readahead/lru_deactivate_page could remain + * PG_readahead/PG_reclaim due to race with end_page_writeback + * About readahead, if the page is written, the flags would be + * reset. So no problem. + * About lru_deactivate_page, if the page is redirty, the flag + * will be reset. So no problem. but if the page is used by readahead + * it will confuse readahead and make it restart the size rampup + * process. But it's a trivial problem. + */ + ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; @@ -1266,7 +1277,6 @@ int clear_page_dirty_for_io(struct page *page) BUG_ON(!PageLocked(page)); - ClearPageReclaim(page); if (mapping && mapping_cap_account_dirty(mapping)) { /* * Yes, Virginia, this is indeed insane. diff --git a/mm/swap.c b/mm/swap.c index 1b9e4ebaffc..0a33714a7cb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -354,26 +354,61 @@ void add_page_to_unevictable_list(struct page *page) * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. + * + * If the page isn't page_mapped and dirty/writeback, the page + * could reclaim asap using PG_reclaim. + * + * 1. active, mapped page -> none + * 2. active, dirty/writeback page -> inactive, head, PG_reclaim + * 3. inactive, mapped page -> none + * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim + * 5. inactive, clean -> inactive, tail + * 6. Others -> none + * + * In 4, why it moves inactive's head, the VM expects the page would + * be write it out by flusher threads as this is much more effective + * than the single-page writeout from reclaim. */ static void lru_deactivate(struct page *page, struct zone *zone) { int lru, file; + bool active; - if (!PageLRU(page) || !PageActive(page)) + if (!PageLRU(page)) return; /* Some processes are using the page */ if (page_mapped(page)) return; + active = PageActive(page); + file = page_is_file_cache(page); lru = page_lru_base_type(page); - del_page_from_lru_list(zone, page, lru + LRU_ACTIVE); + del_page_from_lru_list(zone, page, lru + active); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGDEACTIVATE); + if (PageWriteback(page) || PageDirty(page)) { + /* + * PG_reclaim could be raced with end_page_writeback + * It can make readahead confusing. But race window + * is _really_ small and it's non-critical problem. + */ + SetPageReclaim(page); + } else { + /* + * The page's writeback ends up during pagevec + * We moves tha page into tail of inactive. + */ + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + __count_vm_event(PGROTATED); + } + + if (active) + __count_vm_event(PGDEACTIVATE); update_page_reclaim_stat(zone, page, file, 0); } -- cgit v1.2.3-70-g09d2 From 3dd7ae8ec0ef399bfea347f297d2a95504d35571 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 22 Mar 2011 16:33:45 -0700 Subject: mm: simplify code of swap.c Clean up code and remove duplicate code. Next patch will use pagevec_lru_move_fn introduced here too. Signed-off-by: Shaohua Li Cc: KOSAKI Motohiro Cc: Hiroyuki Kamezawa Cc: Andi Kleen Reviewed-by: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 133 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 58 insertions(+), 75 deletions(-) (limited to 'mm/swap.c') diff --git a/mm/swap.c b/mm/swap.c index 0a33714a7cb..a448db377cb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -179,15 +179,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -195,30 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); + spin_unlock_irqrestore(&zone->lru_lock, flags); release_pages(pvec->pages, pvec->nr, pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + enum lru_list lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. */ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -369,10 +387,11 @@ void add_page_to_unevictable_list(struct page *page) * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ -static void lru_deactivate(struct page *page, struct zone *zone) +static void lru_deactivate_fn(struct page *page, void *arg) { int lru, file; bool active; + struct zone *zone = page_zone(page); if (!PageLRU(page)) return; @@ -412,31 +431,6 @@ static void lru_deactivate(struct page *page, struct zone *zone) update_page_reclaim_stat(zone, page, file, 0); } -static void ____pagevec_lru_deactivate(struct pagevec *pvec) -{ - int i; - struct zone *zone = NULL; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - lru_deactivate(page, zone); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); -} - - /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -466,7 +460,7 @@ static void drain_cpu_pagevecs(int cpu) pvec = &per_cpu(lru_deactivate_pvecs, cpu); if (pagevec_count(pvec)) - ____pagevec_lru_deactivate(pvec); + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); } /** @@ -483,7 +477,7 @@ void deactivate_page(struct page *page) struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); if (!pagevec_add(pvec, page)) - ____pagevec_lru_deactivate(pvec); + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); put_cpu_var(lru_deactivate_pvecs); } } @@ -630,44 +624,33 @@ void lru_add_page_tail(struct zone* zone, } } +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); -- cgit v1.2.3-70-g09d2