From 117aad1e9e4d97448d1df3f84b08bd65811e6d6a Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Mon, 30 Sep 2013 13:45:16 -0700 Subject: mm: avoid reinserting isolated balloon pages into LRU lists Isolated balloon pages can wrongly end up in LRU lists when migrate_pages() finishes its round without draining all the isolated page list. The same issue can happen when reclaim_clean_pages_from_list() tries to reclaim pages from an isolated page list, before migration, in the CMA path. Such balloon page leak opens a race window against LRU lists shrinkers that leads us to the following kernel panic: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] shrink_page_list+0x24e/0x897 PGD 3cda2067 PUD 3d713067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 340 Comm: kswapd0 Not tainted 3.12.0-rc1-22626-g4367597 #87 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: shrink_page_list+0x24e/0x897 RSP: 0000:ffff88003da499b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88003e82bd60 RCX: 00000000000657d5 RDX: 0000000000000000 RSI: 000000000000031f RDI: ffff88003e82bd40 RBP: ffff88003da49ab0 R08: 0000000000000001 R09: 0000000081121a45 R10: ffffffff81121a45 R11: ffff88003c4a9a28 R12: ffff88003e82bd40 R13: ffff88003da0e800 R14: 0000000000000001 R15: ffff88003da49d58 FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000067d9000 CR3: 000000003ace5000 CR4: 00000000000407b0 Call Trace: shrink_inactive_list+0x240/0x3de shrink_lruvec+0x3e0/0x566 __shrink_zone+0x94/0x178 shrink_zone+0x3a/0x82 balance_pgdat+0x32a/0x4c2 kswapd+0x2f0/0x372 kthread+0xa2/0xaa ret_from_fork+0x7c/0xb0 Code: 80 7d 8f 01 48 83 95 68 ff ff ff 00 4c 89 e7 e8 5a 7b 00 00 48 85 c0 49 89 c5 75 08 80 7d 8f 00 74 3e eb 31 48 8b 80 18 01 00 00 <48> 8b 74 0d 48 8b 78 30 be 02 00 00 00 ff d2 eb RIP [] shrink_page_list+0x24e/0x897 RSP CR2: 0000000000000028 ---[ end trace 703d2451af6ffbfd ]--- Kernel panic - not syncing: Fatal exception This patch fixes the issue, by assuring the proper tests are made at putback_movable_pages() & reclaim_clean_pages_from_list() to avoid isolated balloon pages being wrongly reinserted in LRU lists. [akpm@linux-foundation.org: clarify awkward comment text] Signed-off-by: Rafael Aquini Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/migrate.c') diff --git a/mm/migrate.c b/mm/migrate.c index 9c8d5f59d30..a26bccd44cc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (unlikely(balloon_page_movable(page))) + if (unlikely(isolated_balloon_page(page))) balloon_page_putback(page); else putback_lru_page(page); -- cgit v1.2.3-70-g09d2 From c3d16e16522fe3fe8759735850a0676da18f4b1d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 16 Oct 2013 13:46:51 -0700 Subject: mm: migration: do not lose soft dirty bit if page is in migration state If page migration is turned on in config and the page is migrating, we may lose the soft dirty bit. If fork and mprotect are called on migrating pages (once migration is complete) pages do not obtain the soft dirty bit in the correspond pte entries. Fix it adding an appropriate test on swap entries. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 ++ mm/migrate.c | 2 ++ mm/mprotect.c | 7 +++++-- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/memory.c b/mm/memory.c index ca000394711..f7b7692c05e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } } diff --git a/mm/migrate.c b/mm/migrate.c index a26bccd44cc..7a7325ee1d0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (pte_swp_soft_dirty(*ptep)) + pte = pte_mksoft_dirty(pte); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b4..a3af058f68e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { + pte_t newpte; /* * A protection check is difficult so * just be safe and disable write */ make_migration_entry_read(&entry); - set_pte_at(mm, addr, pte, - swp_entry_to_pte(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + set_pte_at(mm, addr, pte, newpte); } pages++; } -- cgit v1.2.3-70-g09d2 From 3f926ab945b60a5824369d21add7710622a2eac0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:46 +0100 Subject: mm: Close races between THP migration and PMD numa clearing THP migration uses the page lock to guard against parallel allocations but there are cases like this still open Task A Task B --------------------- --------------------- do_huge_pmd_numa_page do_huge_pmd_numa_page lock_page mpol_misplaced == -1 unlock_page goto clear_pmdnuma lock_page mpol_misplaced == 2 migrate_misplaced_transhuge pmd = pmd_mknonnuma set_pmd_at During hours of testing, one crashed with weird errors and while I have no direct evidence, I suspect something like the race above happened. This patch extends the page lock to being held until the pmd_numa is cleared to prevent migration starting in parallel while the pmd_numa is being cleared. It also flushes the old pmd entry and orders pagetable insertion before rmap insertion. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 33 +++++++++++++++------------------ mm/migrate.c | 19 +++++++++++-------- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'mm/migrate.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 00ddfcdd810..cca80d96e50 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1304,24 +1304,25 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) { /* If the page was locked, there are no parallel migrations */ - if (page_locked) { - unlock_page(page); + if (page_locked) goto clear_pmdnuma; - } - /* Otherwise wait for potential migrations and retry fault */ + /* + * Otherwise wait for potential migrations and retry. We do + * relock and check_same as the page may no longer be mapped. + * As the fault is being retried, do not account for it. + */ spin_unlock(&mm->page_table_lock); wait_on_page_locked(page); + page_nid = -1; goto out; } /* Page is misplaced, serialise migrations and parallel THP splits */ get_page(page); spin_unlock(&mm->page_table_lock); - if (!page_locked) { + if (!page_locked) lock_page(page); - page_locked = true; - } anon_vma = page_lock_anon_vma_read(page); /* Confirm the PTE did not while locked */ @@ -1329,32 +1330,28 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); + page_nid = -1; goto out_unlock; } - /* Migrate the THP to the requested node */ + /* + * Migrate the THP to the requested node, returns with page unlocked + * and pmd_numa cleared. + */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (migrated) page_nid = target_nid; - else - goto check_same; goto out; - -check_same: - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - /* Someone else took our fault */ - page_nid = -1; - goto out_unlock; - } clear_pmdnuma: + BUG_ON(!PageLocked(page)); pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); diff --git a/mm/migrate.c b/mm/migrate.c index 7a7325ee1d0..c04692774e8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1715,12 +1715,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unlock_page(new_page); put_page(new_page); /* Free it */ - unlock_page(page); + /* Retake the callers reference and putback on LRU */ + get_page(page); putback_lru_page(page); - - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - isolated = 0; - goto out; + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); + goto out_fail; } /* @@ -1737,9 +1737,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - page_add_new_anon_rmap(new_page, vma, haddr); - + pmdp_clear_flush(vma, haddr, pmd); set_pmd_at(mm, haddr, pmd, entry); + page_add_new_anon_rmap(new_page, vma, haddr); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(page); /* @@ -1758,7 +1758,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); -out: mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); @@ -1767,6 +1766,10 @@ out: out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + unlock_page(page); put_page(page); return 0; -- cgit v1.2.3-70-g09d2