From ebb76ce16daf6908dc030dec1c00827d37129fe5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 29 Dec 2010 14:07:11 -0800 Subject: memcg: fix wrong VM_BUG_ON() in try_charge()'s mm->owner check At __mem_cgroup_try_charge(), VM_BUG_ON(!mm->owner) is checked. But as commented in mem_cgroup_from_task(), mm->owner can be NULL in some racy case. This check of VM_BUG_ON() is bad. A possible story to hit this is at swapoff()->try_to_unuse(). It passes mm_struct to mem_cgroup_try_charge_swapin() while mm->owner is NULL. If we can't get proper mem_cgroup from swap_cgroup information, mm->owner is used as charge target and we see NULL. Cc: Daisuke Nishimura Cc: KOSAKI Motohiro Reported-by: Hugh Dickins Reported-by: Thomas Meyer Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Balbir Singh Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7a22b412921..00bb8a64d02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1925,19 +1925,18 @@ again: rcu_read_lock(); p = rcu_dereference(mm->owner); - VM_BUG_ON(!p); /* - * because we don't have task_lock(), "p" can exit while - * we're here. In that case, "mem" can point to root - * cgroup but never be NULL. (and task_struct itself is freed - * by RCU, cgroup itself is RCU safe.) Then, we have small - * risk here to get wrong cgroup. But such kind of mis-account - * by race always happens because we don't have cgroup_mutex(). - * It's overkill and we allow that small race, here. + * Because we don't have task_lock(), "p" can exit. + * In that case, "mem" can point to root or p can be NULL with + * race with swapoff. Then, we have small risk of mis-accouning. + * But such kind of mis-account by race always happens because + * we don't have cgroup_mutex(). It's overkill and we allo that + * small race, here. + * (*) swapoff at el will charge against mm-struct not against + * task-struct. So, mm->owner can be NULL. */ mem = mem_cgroup_from_task(p); - VM_BUG_ON(!mem); - if (mem_cgroup_is_root(mem)) { + if (!mem || mem_cgroup_is_root(mem)) { rcu_read_unlock(); goto done; } -- cgit v1.2.3-70-g09d2 From ec1685109f1314a30919489ef2800ed626a38c1e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:56 -0800 Subject: thp: memcg compound Teach memcg to charge/uncharge compound pages. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 83 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 30 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00bb8a64d02..356d4964fe9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,6 +1027,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -1887,12 +1891,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. 
*/ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1917,7 +1923,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (page_size == PAGE_SIZE && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1940,7 +1946,7 @@ again: rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (page_size == PAGE_SIZE && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1981,7 +1987,7 @@ again: case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + csize = page_size; css_put(&mem->css); mem = NULL; goto again; @@ -2002,8 +2008,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); css_put(&mem->css); done: *memcg = mem; @@ -2031,9 +2037,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -2089,8 +2096,9 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) */ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { /* try_charge() can return NULL to *memcg, taking care of it. */ if (!mem) @@ -2099,7 +2107,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, page_size); return; } @@ -2173,7 +2181,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, mem_cgroup_charge_statistics(from, pc, false); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. 
*/ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, PAGE_SIZE); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2234,13 +2242,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, + PAGE_SIZE); if (ret || !parent) goto put_back; ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, PAGE_SIZE); put_back: putback_lru_page(page); put: @@ -2261,6 +2270,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2268,11 +2281,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -2281,8 +2294,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -2388,13 +2399,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2410,7 +2421,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2459,11 +2470,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2490,6 +2502,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (page_size != PAGE_SIZE) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. 
@@ -2503,9 +2518,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memsw_bytes += PAGE_SIZE; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2519,6 +2534,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; if (mem_cgroup_disabled()) return NULL; @@ -2526,6 +2542,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) + page_size <<= compound_order(page); + /* * Check if our page_cgroup is valid */ @@ -2579,7 +2598,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); return mem; @@ -2774,6 +2793,7 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2823,7 +2843,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2850,7 +2870,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); return ret; } @@ -4461,7 +4481,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4623,6 +4644,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4789,6 +4811,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); -- cgit v1.2.3-70-g09d2 From 152c9ccb75548c027fa3103efa4fa4e19a345449 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:46:56 -0800 Subject: thp: transhuge-memcg: commit tail pages at charge By this patch, when a transparent hugepage is charged, not only the head page but also all the tail pages are committed, IOW pc->mem_cgroup and pc->flags of tail pages are set. Without this patch: - Tail pages are not linked to any memcg's LRU at splitting. This causes many problems, for example, the charged memcg's directory can never be rmdir'ed because it doesn't have enough pages to scan to make the usage decrease to 0. - "rss" field in memory.stat would be incorrect. 
Moreover, usage_in_bytes in root cgroup is calculated by the stat not by res_counter(since 2.6.32), it would be incorrect too. Signed-off-by: Daisuke Nishimura Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 356d4964fe9..a1bb59d4c9d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2094,23 +2094,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be * USED state. If already USED, uncharge and return. */ - -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype, - int page_size) +static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype) { - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); - return; - } - pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2135,6 +2122,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, } mem_cgroup_charge_statistics(mem, pc, true); +} + +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) +{ + int i; + int count = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ + for (i = 0; i < count; i++) + ____mem_cgroup_commit_charge(mem, pc + i, ctype); unlock_page_cgroup(pc); /* @@ -2532,6 +2546,8 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { + int i; + int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; int page_size = PAGE_SIZE; @@ -2545,6 +2561,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageTransHuge(page)) page_size <<= compound_order(page); + count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2577,7 +2594,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + for (i = 0; i < count; i++) + mem_cgroup_charge_statistics(mem, pc + i, false); ClearPageCgroupUsed(pc); /* -- cgit v1.2.3-70-g09d2 From 2c888cfbc1b45508a44763d85ba2e8ac43faff5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: fix anon memory statistics with transparent hugepages Count each transparent hugepage as HPAGE_PMD_NR pages in the LRU statistics, so the Active(anon) and Inactive(anon) statistics in /proc/meminfo are correct. 
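For scale: with the usual 4 KiB base pages and 2 MiB transparent hugepages (assumed example values, not stated in the patch), HPAGE_PMD_NR is 512, so a THP sitting on an anonymous LRU now contributes 512 to the zone's LRU counter instead of 1. Correspondingly, when such a page is split while on the LRU, the hunk below reduces the counter by HPAGE_PMD_NR - 1 (511) so the now-ordinary head page stays counted exactly once.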
Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++++++++ include/linux/mm_inline.h | 8 +++++--- mm/huge_memory.c | 10 ++++++++++ mm/memcontrol.c | 2 +- mm/vmscan.c | 11 ++++++----- 5 files changed, 30 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 82759522873..9b48c24df26 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,11 +117,19 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, return; __vma_adjust_trans_huge(vma, start, end, adjust_next); } +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUG(); 0; }) +#define hpage_nr_pages(x) 1 + #define transparent_hugepage_enabled(__vma) 0 #define transparent_hugepage_flags 0UL diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 650f31eabdb..8f7d24712dc 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test @@ -24,7 +26,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, struct list_head *head) { list_add(&page->lru, head); - __inc_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -38,7 +40,7 @@ static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -73,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 892d8a17a7e..f4f6041176a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1143,6 +1143,7 @@ static void __split_huge_page_refcount(struct page *page) int i; unsigned long head_index = page->index; struct zone *zone = page_zone(page); + int zonestat; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1207,6 +1208,15 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. 
+ */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1bb59d4c9d..f4ea3410fb4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1091,7 +1091,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f5b762ae23a..0882014d2ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1103,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1158,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1483,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); -- cgit v1.2.3-70-g09d2 From 37c2ac7872a9387542616f658d20ac25f5bdb32e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: compound_trans_order Read compound_trans_order safe. Noop for CONFIG_TRANSPARENT_HUGEPAGE=n. 
Signed-off-by: Andrea Arcangeli Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++++++ mm/memcontrol.c | 12 ++++++------ mm/memory-failure.c | 12 ++++++------ 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2695beab8..ce97a2bb0b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -450,6 +450,20 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f4ea3410fb4..741206ffdac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,10 +1027,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; - int page_size = PAGE_SIZE; - - if (PageTransHuge(page)) - page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -2286,8 +2282,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, int ret; int page_size = PAGE_SIZE; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2558,8 +2556,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } count = page_size >> PAGE_SHIFT; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1b43d0ffff6..548fbd70f02 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. 
@@ -930,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -938,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -968,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -1166,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1304,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ -- cgit v1.2.3-70-g09d2 From 2a7106f2cb0768d00fe8c1eb42a754a7d8518f08 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:37 -0800 Subject: memcg: create extensible page stat update routines Replace usage of the mem_cgroup_update_file_mapped() memcg statistic update routine with two new routines: * mem_cgroup_inc_page_stat() * mem_cgroup_dec_page_stat() As before, only the file_mapped statistic is managed. However, these more general interfaces allow for new statistics to be more easily added. New statistics are added with memcg dirty page accounting. Signed-off-by: Greg Thelen Signed-off-by: Andrea Righi Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 16 +++++++--------- mm/rmap.c | 4 ++-- 3 files changed, 37 insertions(+), 14 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a0762aea..067115ce6b3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. 
*/ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -293,8 +313,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 741206ffdac..3d8a0c79dec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1600,7 +1600,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1623,30 +1624,27 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) unlock_page_cgroup(pc); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. 
diff --git a/mm/rmap.c b/mm/rmap.c index c30f33854f9..f21f4a1d6a1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -937,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -979,7 +979,7 @@ void page_remove_rmap(struct page *page) NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.2.3-70-g09d2 From dbd4ea78f002df283c95d9774837041735fa1bf9 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 13 Jan 2011 15:47:38 -0800 Subject: memcg: add lock to synchronize page accounting and migration Introduce a new bit spin lock, PCG_MOVE_LOCK, to synchronize the page accounting and migration code. This reworks the locking scheme of _update_stat() and _move_account() by adding new lock bit PCG_MOVE_LOCK, which is always taken under IRQ disable. 1. If pages are being migrated from a memcg, then updates to that memcg page statistics are protected by grabbing PCG_MOVE_LOCK using move_lock_page_cgroup(). In an upcoming commit, memcg dirty page accounting will be updating memcg page accounting (specifically: num writeback pages) from IRQ context (softirq). Avoid a deadlocking nested spin lock attempt by disabling irq on the local processor when grabbing the PCG_MOVE_LOCK. 2. lock for update_page_stat is used only for avoiding race with move_account(). So, IRQ awareness of lock_page_cgroup() itself is not a problem. The problem is between mem_cgroup_update_page_stat() and mem_cgroup_move_account_page(). Trade-off: * Changing lock_page_cgroup() to always disable IRQ (or local_bh) has some impacts on performance and I think it's bad to disable IRQ when it's not necessary. * adding a new lock makes move_account() slower. Score is here. Performance Impact: moving a 8G anon process. Before: real 0m0.792s user 0m0.000s sys 0m0.780s After: real 0m0.854s user 0m0.000s sys 0m0.842s This score is bad but planned patches for optimization can reduce this impact. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Greg Thelen Reviewed-by: Minchan Kim Acked-by: Daisuke Nishimura Cc: Andrea Righi Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 9 +++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index fdb5a92b5ac..5b0c971d7ca 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,15 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. 
following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_FILE_DIRTY, /* page is dirty */ PCG_FILE_WRITEBACK, /* page is under writeback */ PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ - PCG_MIGRATION, /* under page migration */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -117,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -130,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. + */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d8a0c79dec..d888956a2cf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1606,6 +1606,7 @@ void mem_cgroup_update_page_stat(struct page *page, struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1617,7 +1618,7 @@ void mem_cgroup_update_page_stat(struct page *page, /* pc->mem_cgroup is unstable ? */ if (unlikely(mem_cgroup_stealed(mem))) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) @@ -1640,7 +1641,7 @@ void mem_cgroup_update_page_stat(struct page *page, out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } @@ -2211,9 +2212,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; + unsigned long flags; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + move_lock_page_cgroup(pc, &flags); __mem_cgroup_move_account(pc, from, to, uncharge); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); -- cgit v1.2.3-70-g09d2 From f3e8eb70b1807d1b30aa6972af0cf30077c40112 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:39 -0800 Subject: memcg: fix unit mismatch in memcg oom limit calculation Adding the number of swap pages to the byte limit of a memory control group makes no sense. Convert the pages to bytes before adding them. The only user of this code is the OOM killer, and the way it is used means that the error results in a higher OOM badness value. Since the cgroup limit is the same for all tasks in the cgroup, the error should have no practical impact at the moment. But let's not wait for future or changing users to trip over it. 
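To see the scale of the mismatch: assuming 4 KiB pages and 1 GiB of total swap (illustrative numbers), total_swap_pages is 262144, so adding it to the byte limit as-is raises the limit by only 256 KiB, whereas total_swap_pages << PAGE_SHIFT adds the intended 1 GiB.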
Signed-off-by: Johannes Weiner Cc: Greg Thelen Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d888956a2cf..9f8ad77f091 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1312,8 +1312,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available -- cgit v1.2.3-70-g09d2 From 043d18b1e5bdfc4870b8a19d00f0d5c636a5c231 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 13 Jan 2011 15:47:40 -0800 Subject: memcg: remove unnecessary return from void-returning mem_cgroup_del_lru_list() Signed-off-by: Minchan Kim Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9f8ad77f091..1b44ad64f28 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -821,7 +821,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) -- cgit v1.2.3-70-g09d2 From dfe076b0971a783469bc2066e85d46e23c8acb1c Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:41 -0800 Subject: memcg: fix deadlock between cpuset and memcg Commit b1dd693e ("memcg: avoid deadlock between move charge and try_charge()") can cause another deadlock about mmap_sem on task migration if cpuset and memcg are mounted onto the same mount point. After the commit, cgroup_attach_task() has sequence like: cgroup_attach_task() ss->can_attach() cpuset_can_attach() mem_cgroup_can_attach() down_read(&mmap_sem) (1) ss->attach() cpuset_attach() mpol_rebind_mm() down_write(&mmap_sem) (2) up_write(&mmap_sem) cpuset_migrate_mm() do_migrate_pages() down_read(&mmap_sem) up_read(&mmap_sem) mem_cgroup_move_task() mem_cgroup_clear_mc() up_read(&mmap_sem) We can cause deadlock at (2) because we've already aquire the mmap_sem at (1). But the commit itself is necessary to fix deadlocks which have existed before the commit like: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot aquire the lock | -> true | schedule() | -> move charge should wake it up Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. 
mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot aquire the lock | if (mc.moving_task) | -> true | schedule() | -> move charge should wake it up This patch fixes all of these problems by: 1. revert the commit. 2. To fix the Ex.1, we set mc.moving_task after mem_cgroup_count_precharge() has released the mmap_sem. 3. To fix the Ex.2, we use down_read_trylock() instead of down_read() in mem_cgroup_move_charge() and, if it has failed to aquire the lock, cancel all extra charges, wake up all waiters, and retry trylock. Signed-off-by: Daisuke Nishimura Reported-by: Ben Blum Cc: Miao Xie Cc: David Rientjes Cc: Paul Menage Cc: Hiroyuki Kamezawa Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 84 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 35 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b44ad64f28..c339d7431bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,7 +292,6 @@ static struct move_charge_struct { unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ - struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4681,7 +4680,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - /* We've already held the mmap_sem */ + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4693,6 +4692,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } + up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4702,10 +4702,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4740,23 +4745,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } - if (mc.mm) { - up_read(&mc.mm->mmap_sem); - mmput(mc.mm); - } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. 
+ */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mc.moving_task = NULL; - mc.mm = NULL; mem_cgroup_end_move(from); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4778,38 +4788,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { - /* - * We do all the move charge works under one mmap_sem to - * avoid deadlock with down_write(&mmap_sem) - * -> try_charge() -> if (mc.moving_task) -> sleep. - */ - down_read(&mm->mmap_sem); - VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); - VM_BUG_ON(mc.mm); - mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; spin_unlock(&mc.lock); - mc.moving_task = current; - mc.mm = mm; + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - /* We call up_read() and mmput() in clear_mc(). */ - } else - mmput(mm); + } + mmput(mm); } return ret; } @@ -4898,7 +4893,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - /* We've already held the mmap_sem */ +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4917,6 +4924,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } + up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4925,11 +4933,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - if (!mc.mm) + struct mm_struct *mm; + + if (!mc.to) /* no need to move charge */ return; - mem_cgroup_move_charge(mc.mm); + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- cgit v1.2.3-70-g09d2 From 17295c88a160c6eea3fcf46cec9d08a0fcb02db9 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 13 Jan 2011 15:47:42 -0800 Subject: memcg: use [kv]zalloc[_node] rather than [kv]malloc+memset In mem_cgroup_alloc() we currently do either kmalloc() or vmalloc() then followed by memset() to zero the memory. This can be more efficiently achieved by using kzalloc() and vzalloc(). There's also one situation where we can use kzalloc_node() - this is what's new in this version of the patch. 
Signed-off-by: Jesper Juhl Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Wu Fengguang Cc: Balbir Singh Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c339d7431bd..6424ba0fce8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4216,13 +4216,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4246,14 +4244,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!mem->stat) goto out_free; -- cgit v1.2.3-70-g09d2 From 50de1dd967d4ba3b8a90ebe7a4f5feca24191317 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:43 -0800 Subject: memcg: fix memory migration of shmem swapcache In the current implementation mem_cgroup_end_migration() decides whether the page migration has succeeded or not by checking "oldpage->mapping". But if we are tring to migrate a shmem swapcache, the page->mapping of it is NULL from the begining, so the check would be invalid. As a result, mem_cgroup_end_migration() assumes the migration has succeeded even if it's not, so "newpage" would be freed while it's not uncharged. This patch fixes it by passing mem_cgroup_end_migration() the result of the page migration. Signed-off-by: Daisuke Nishimura Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Minchan Kim Reviewed-by: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- mm/memcontrol.c | 5 ++--- mm/migrate.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 067115ce6b3..6a576f98943 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim. 
@@ -251,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6424ba0fce8..8ab84103143 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2896,7 +2896,7 @@ int mem_cgroup_prepare_migration(struct page *page, /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2905,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 5b7d1fd2962..46fe8cc13d6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -768,7 +768,7 @@ skip_unmap: uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); -- cgit v1.2.3-70-g09d2
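Taken together, the THP-related patches above revolve around one piece of arithmetic: a transparent hugepage is charged as a single block of PAGE_SIZE << compound_order(page) bytes and then committed or uncharged as that many base pages. The following standalone userspace sketch (not kernel code; PAGE_SHIFT and HPAGE_PMD_ORDER are assumed example values for x86_64) only illustrates that size/count relationship.

#include <stdio.h>

#define PAGE_SHIFT      12                     /* assumed: 4 KiB base pages */
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define HPAGE_PMD_ORDER 9                      /* assumed: 2 MiB transparent hugepage */

/*
 * Mirrors the pattern the patches add in several places:
 *   if (PageTransHuge(page))
 *           page_size <<= compound_order(page);
 */
static unsigned long charge_size(int order)
{
        return PAGE_SIZE << order;
}

int main(void)
{
        unsigned long size = charge_size(HPAGE_PMD_ORDER);
        unsigned long nr_pages = size >> PAGE_SHIFT;   /* page_cgroups to commit/uncharge */

        printf("hugepage charge: %lu bytes = %lu base pages\n", size, nr_pages);
        /* with the assumed values: 2097152 bytes = 512 base pages (HPAGE_PMD_NR) */
        return 0;
}

The same page count is what hpage_nr_pages() feeds into the LRU statistics and what __mem_cgroup_commit_charge() iterates over when marking the tail page_cgroups; charges larger than PAGE_SIZE also skip the per-CPU stock (consume_stock()) and the batched uncharge path, as the hunks above show.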