From 8bfa3f9a012c4049f3d661f7a772cd9c27a7da99 Mon Sep 17 00:00:00 2001
From: Jianguo Wu
Date: Tue, 12 Nov 2013 15:07:16 -0800
Subject: mm/huge_memory.c: fix stale comments of transparent_hugepage_flags

Since commit 13ece886d99c ("thp: transparent hugepage config choice"),
transparent hugepage support is disabled by default, and
TRANSPARENT_HUGEPAGE_ALWAYS is configured when TRANSPARENT_HUGEPAGE=y.

And since commit d39d33c332c6 ("thp: enable direct defrag"), defrag is
enabled by default for all transparent hugepage page faults, not only
for those in MADV_HUGEPAGE regions.

Signed-off-by: Jianguo Wu
Reviewed-by: Wanpeng Li
Acked-by: Kirill A. Shutemov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'mm/huge_memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2612f60f53e..4f7e2113646 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@
 #include "internal.h"
 
 /*
- * By default transparent hugepage support is enabled for all mappings
- * and khugepaged scans all mappings. Defrag is only invoked by
- * khugepaged hugepage allocations and by page faults inside
- * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
- * allocations.
+ * By default transparent hugepage support is disabled in order that avoid
+ * to risk increase the memory footprint of applications without a guaranteed
+ * benefit. When transparent hugepage support is enabled, is for all mappings,
+ * and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
  */
 unsigned long transparent_hugepage_flags __read_mostly =
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
-- 
cgit v1.2.3-70-g09d2
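Before the next patch in the series, a minimal user-space sketch (illustrative only, not part of any patch here) may help readers unfamiliar with the MADV_HUGEPAGE regions that the old comment refers to: it maps an anonymous region and opts it in to THP with madvise(). The 16 MB size and the _GNU_SOURCE guard are assumptions of the example.

/* Illustrative example only -- not kernel code from this series. */
#define _GNU_SOURCE             /* assumption: needed on some libcs for MADV_HUGEPAGE */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16UL << 20;        /* 16 MB, a multiple of the 2 MB hugepage size */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Mark the region as a MADV_HUGEPAGE region; with the "madvise"
	 * THP policy, only regions hinted like this are eligible for
	 * hugepage promotion by page faults and khugepaged.
	 */
	if (madvise(p, len, MADV_HUGEPAGE) != 0)
		perror("madvise(MADV_HUGEPAGE)");

	memset(p, 0, len);              /* touch the memory so it is actually faulted in */
	munmap(p, len);
	return 0;
}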
From 10dc4155c7714f508fe2e4667164925ea971fb25 Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Tue, 12 Nov 2013 15:07:35 -0800
Subject: mm: thp: cleanup: mv alloc_hugepage to better place

Move alloc_hugepage() to a better place; there is no need for a
separate #ifndef CONFIG_NUMA block.

Signed-off-by: Bob Liu
Reviewed-by: Yasuaki Ishimatsu
Acked-by: Kirill A. Shutemov
Cc: Andrea Arcangeli
Cc: Mel Gorman
Cc: Andrew Davidoff
Cc: Wanpeng Li
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'mm/huge_memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4f7e2113646..411c4f2c049 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -759,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
 			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-			   HPAGE_PMD_ORDER);
-}
-#endif
-
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 		struct page *zero_page)
@@ -2251,6 +2243,12 @@ static struct page
 	return *hpage;
 }
 #else
+static inline struct page *alloc_hugepage(int defrag)
+{
+	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+			   HPAGE_PMD_ORDER);
+}
+
 static struct page *khugepaged_alloc_hugepage(bool *wait)
 {
 	struct page *hpage;
-- 
cgit v1.2.3-70-g09d2

From 9f1b868a13ac36bd207a571f5ea1193d823ab18d Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Tue, 12 Nov 2013 15:07:37 -0800
Subject: mm: thp: khugepaged: add policy for finding target node

Khugepaged scans/frees HPAGE_PMD_NR normal pages and replaces them with
a hugepage allocated from the node of the first scanned normal page.
This policy is too coarse and may give unexpected results to its users.

The problem is that the original page balance among all nodes is broken
once khugepaged starts.  Consider the case where the first scanned
normal page is allocated from node A while most of the other scanned
normal pages are allocated from node B or C: khugepaged will still
allocate the hugepage from node A, causing extra memory pressure on
node A that did not exist before khugepaged started.

This patch tries to fix the problem by making khugepaged allocate the
hugepage from the node with the highest count of scanned normal pages,
so that the effect on the original page balance is minimized.  The
remaining problem is that if the scanned normal pages are spread
equally across nodes A, B and C, node A will still suffer extra memory
pressure after khugepaged starts.

Andrew Davidoff reported a related issue several days ago.  He wanted
his application to interleave among all nodes and used
"numactl --interleave=all ./test" to run the testcase, but the result
was not as expected.

cat /proc/2814/numa_maps:
7f50bd440000 interleave:0-3 anon=51403 dirty=51403 N0=435 N1=435 N2=435 N3=50098

The result showed that most pages came from node 3 instead of being
interleaved among nodes 0-3, which was unreasonable.
This patch also fixes this issue by allocating hugepages round-robin
from all nodes that share the same hit record.  After this patch the
result was as expected:

7f78399c0000 interleave:0-3 anon=51403 dirty=51403 N0=12723 N1=12723 N2=13235 N3=12722

The simple testcase is like this:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	char *p;
	int i;
	int j;

	for (i = 0; i < 200; i++) {
		p = (char *)malloc(1048576);
		printf("malloc done\n");

		if (p == 0) {
			printf("Out of memory\n");
			return 1;
		}

		for (j = 0; j < 1048576; j++) {
			p[j] = 'A';
		}
		printf("touched memory\n");

		sleep(1);
	}

	printf("enter sleep\n");
	while (1) {
		sleep(100);
	}
}

[akpm@linux-foundation.org: make last_khugepaged_target_node local to khugepaged_find_target_node()]
Reported-by: Andrew Davidoff
Tested-by: Andrew Davidoff
Signed-off-by: Bob Liu
Cc: Andrea Arcangeli
Cc: Kirill A. Shutemov
Cc: Mel Gorman
Cc: Yasuaki Ishimatsu
Cc: Wanpeng Li
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 53 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 9 deletions(-)

(limited to 'mm/huge_memory.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 411c4f2c049..0556c6a4495 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2191,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
 			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
 }
 
+static int khugepaged_node_load[MAX_NUMNODES];
+
 #ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+	static int last_khugepaged_target_node = NUMA_NO_NODE;
+	int nid, target_node = 0, max_value = 0;
+
+	/* find first node with max normal pages hit */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		if (khugepaged_node_load[nid] > max_value) {
+			max_value = khugepaged_node_load[nid];
+			target_node = nid;
+		}
+
+	/* do some balance if several nodes have the same hit record */
+	if (target_node <= last_khugepaged_target_node)
+		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+				nid++)
+			if (max_value == khugepaged_node_load[nid]) {
+				target_node = nid;
+				break;
+			}
+
+	last_khugepaged_target_node = target_node;
+	return target_node;
+}
+
 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 {
 	if (IS_ERR(*hpage)) {
@@ -2225,9 +2252,8 @@ static struct page
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	*hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
-				      node, __GFP_OTHER_NODE);
-
+	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
 	/*
 	 * After allocating the hugepage, release the mmap_sem read lock in
 	 * preparation for taking it in write mode.
@@ -2243,6 +2269,11 @@ static struct page
 	return *hpage;
 }
 #else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
+}
+
 static inline struct page *alloc_hugepage(int defrag)
 {
 	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
@@ -2455,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	if (pmd_trans_huge(*pmd))
 		goto out;
 
+	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
@@ -2471,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		if (unlikely(!page))
 			goto out_unmap;
 		/*
-		 * Chose the node of the first page. This could
-		 * be more sophisticated and look at more pages,
-		 * but isn't for now.
+		 * Record which node the original page is from and save this
+		 * information to khugepaged_node_load[].
+		 * Khupaged will allocate hugepage from the node has the max
+		 * hit record.
 		 */
-		if (node == NUMA_NO_NODE)
-			node = page_to_nid(page);
+		node = page_to_nid(page);
+		khugepaged_node_load[node]++;
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -2491,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret)
+	if (ret) {
+		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
 		collapse_huge_page(mm, address, hpage, vma, node);
+	}
 out:
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2
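The node-selection policy above can be exercised in isolation with the
following user-space sketch.  It mirrors the khugepaged_find_target_node()
logic added by this patch (pick the node with the most scanned-page hits,
then rotate among nodes that tie), but it is illustrative only: MAX_NODES,
the equal sample loads and passing the last target as a parameter instead
of keeping it in a static variable are assumptions of the demo.

/* Illustrative example only -- not kernel code from this series. */
#include <stdio.h>

#define MAX_NODES 4

static int find_target_node(const int load[], int last_target)
{
	int nid, target = 0, max_value = 0;

	/* first node with the maximum hit count */
	for (nid = 0; nid < MAX_NODES; nid++) {
		if (load[nid] > max_value) {
			max_value = load[nid];
			target = nid;
		}
	}

	/* rotate among nodes that share the maximum hit count */
	if (target <= last_target) {
		for (nid = last_target + 1; nid < MAX_NODES; nid++) {
			if (load[nid] == max_value) {
				target = nid;
				break;
			}
		}
	}

	return target;
}

int main(void)
{
	int load[MAX_NODES] = { 128, 128, 128, 128 };   /* equal hits on every node */
	int last = -1;                                  /* stands in for NUMA_NO_NODE */
	int i;

	/* with equal hit counts the chosen node rotates: 0, 1, 2, 3, 0, 1 */
	for (i = 0; i < 6; i++) {
		last = find_target_node(load, last);
		printf("collapse %d -> node %d\n", i, last);
	}
	return 0;
}

With equal hit counts the target rotates across all nodes, which is what
restores the interleaving seen in the second numa_maps output above.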