diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 431 |
1 files changed, 294 insertions, 137 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efa8ea07ff..da53a252b25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; -static int really_do_swap_account __initdata = 1; /* for remember boot option*/ + +/* for remember boot option*/ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +static int really_do_swap_account __initdata = 1; +#else +static int really_do_swap_account __initdata = 0; +#endif + #else #define do_swap_account (0) #endif @@ -278,7 +285,7 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { - spinlock_t lock; /* for from, to, moving_task */ + spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; @@ -593,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, } static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, - struct page_cgroup *pc, - bool charge) + bool file, int nr_pages) { - int val = (charge) ? 1 : -1; - preempt_disable(); - if (PageCgroupCache(pc)) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); + if (file) + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - if (charge) + /* pagein of a big page is an event. So, ignore page size */ + if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); - __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + nr_pages = -nr_pages; /* for event */ + } + + __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); preempt_enable(); } @@ -808,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) @@ -830,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -851,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) += 1; + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); if (mem_cgroup_is_root(pc->mem_cgroup)) return; @@ -1024,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return NULL; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); if (!mz) return NULL; @@ -1079,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ @@ -1113,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1304,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available @@ -1592,11 +1610,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1606,39 +1626,36 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) if (unlikely(!mem || !PageCgroupUsed(pc))) goto out; /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(mem))) { + if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -1834,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. */ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* @@ -1879,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1909,7 +1940,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (page_size == PAGE_SIZE && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1917,23 +1948,22 @@ again: rcu_read_lock(); p = rcu_dereference(mm->owner); - VM_BUG_ON(!p); /* - * because we don't have task_lock(), "p" can exit while - * we're here. In that case, "mem" can point to root - * cgroup but never be NULL. (and task_struct itself is freed - * by RCU, cgroup itself is RCU safe.) Then, we have small - * risk here to get wrong cgroup. But such kind of mis-account - * by race always happens because we don't have cgroup_mutex(). - * It's overkill and we allow that small race, here. + * Because we don't have task_lock(), "p" can exit. + * In that case, "mem" can point to root or p can be NULL with + * race with swapoff. Then, we have small risk of mis-accouning. + * But such kind of mis-account by race always happens because + * we don't have cgroup_mutex(). It's overkill and we allo that + * small race, here. + * (*) swapoff at el will charge against mm-struct not against + * task-struct. So, mm->owner can be NULL. */ mem = mem_cgroup_from_task(p); - VM_BUG_ON(!mem); - if (mem_cgroup_is_root(mem)) { + if (!mem || mem_cgroup_is_root(mem)) { rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (page_size == PAGE_SIZE && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1974,7 +2004,7 @@ again: case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + csize = page_size; css_put(&mem->css); mem = NULL; goto again; @@ -1995,8 +2025,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); css_put(&mem->css); done: *memcg = mem; @@ -2024,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -2076,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return mem; } -/* - * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be - * USED state. If already USED, uncharge and return. - */ - static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { + int nr_pages = page_size >> PAGE_SHIFT; + /* try_charge() can return NULL to *memcg, taking care of it. */ if (!mem) return; @@ -2092,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, page_size); return; } - + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2119,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, pc, true); - + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); /* * "charge_statistics" updated event counter. Then, check it. @@ -2130,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, memcg_check_events(mem, pc->page); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ + (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compund_lock. + */ +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *tail_pc = lookup_page_cgroup(tail); + unsigned long flags; + + if (mem_cgroup_disabled()) + return; + /* + * We have no races with charge/uncharge but will have races with + * page state accounting. + */ + move_lock_page_cgroup(head_pc, &flags); + + tail_pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb(); /* see __commit_charge() */ + if (PageCgroupAcctLRU(head_pc)) { + enum lru_list lru; + struct mem_cgroup_per_zone *mz; + + /* + * LRU flags cannot be copied because we need to add tail + *.page to LRU by generic call and our hook will be called. + * We hold lru_lock, then, reduce counter directly. + */ + lru = page_lru(head); + mz = page_cgroup_zoneinfo(head_pc); + MEM_CGROUP_ZSTAT(mz, lru) -= 1; + } + tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + move_unlock_page_cgroup(head_pc, &flags); +} +#endif + /** * __mem_cgroup_move_account - move account of the page * @pc: page_cgroup of the page. @@ -2148,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, */ static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, + int charge_size) { + int nr_pages = charge_size >> PAGE_SHIFT; + VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!PageCgroupLocked(pc)); + VM_BUG_ON(!page_is_cgroup_locked(pc)); VM_BUG_ON(!PageCgroupUsed(pc)); VM_BUG_ON(pc->mem_cgroup != from); @@ -2163,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, pc, false); + mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, charge_size); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, pc, true); + mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of @@ -2185,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * __mem_cgroup_move_account() */ static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, + bool uncharge, int charge_size) { int ret = -EINVAL; + unsigned long flags; + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) + return -EBUSY; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { - __mem_cgroup_move_account(pc, from, to, uncharge); + move_lock_page_cgroup(pc, &flags); + __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); @@ -2214,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; + int page_size = PAGE_SIZE; + unsigned long flags; int ret; /* Is ROOT ? */ @@ -2226,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (isolate_lru_page(page)) goto put; + if (PageTransHuge(page)) + page_size = HPAGE_SIZE; + parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, + &parent, false, page_size); if (ret || !parent) goto put_back; - ret = mem_cgroup_move_account(pc, child, parent, true); + if (page_size > PAGE_SIZE) + flags = compound_lock_irqsave(page); + + ret = mem_cgroup_move_account(pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, page_size); + + if (page_size > PAGE_SIZE) + compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); put: @@ -2252,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; + } + pc = lookup_page_cgroup(page); /* can happen at boot */ if (unlikely(!pc)) return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -2274,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -2381,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2403,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2452,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2483,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (page_size != PAGE_SIZE) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. @@ -2496,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memsw_bytes += PAGE_SIZE; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2510,8 +2625,10 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { + int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; if (mem_cgroup_disabled()) return NULL; @@ -2519,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) { + page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } + + count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2551,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); ClearPageCgroupUsed(pc); /* @@ -2572,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); return mem; @@ -2767,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2816,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2843,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2858,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { @@ -4169,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4199,14 +4320,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!mem->stat) goto out_free; @@ -4454,7 +4574,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4616,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4653,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4691,18 +4818,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. + */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; - mc.moving_task = NULL; spin_unlock(&mc.lock); mem_cgroup_end_move(from); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4729,16 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; - mc.moving_task = current; spin_unlock(&mc.lock); + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) @@ -4767,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); @@ -4787,7 +4921,7 @@ retry: goto put; pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(pc, - mc.from, mc.to, false)) { + mc.from, mc.to, false, PAGE_SIZE)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -4832,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - down_read(&mm->mmap_sem); +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4911,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = { }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static int __init enable_swap_account(char *s) +{ + /* consider enabled if no parameter or 1 is given */ + if (!(*s) || !strcmp(s, "=1")) + really_do_swap_account = 1; + else if (!strcmp(s, "=0")) + really_do_swap_account = 0; + return 1; +} +__setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - really_do_swap_account = 0; + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); |