diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 416 |
1 files changed, 138 insertions, 278 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f009a14918d..90dc501eaf3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2551,55 +2551,72 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. + */ +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0; - ret = res_counter_charge(&memcg->res, csize, &fail_res); +retry: + if (consume_stock(memcg, nr_pages)) + goto done; - if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem; - if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@ -2609,96 +2626,38 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; -} - -/** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ -static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) -{ - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry; - if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry; if (gfp_mask & __GFP_NOFAIL) - oom = false; -again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass; - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass; - if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); -done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry; + +done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); +done: + return ret; } /** @@ -2712,15 +2671,14 @@ bypass: */ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) + unsigned int nr_pages) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -2738,13 +2696,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); } /* @@ -2756,9 +2712,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, { unsigned long bytes = nr_pages * PAGE_SIZE; - if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@ -2842,14 +2795,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, } pc->mem_cgroup = memcg; - /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. - */ - smp_wmb(); SetPageCgroupUsed(pc); if (lrucare) { @@ -2937,8 +2882,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) if (ret) return ret; - ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* * mem_cgroup_try_charge() chosed to bypass to root due to @@ -3463,12 +3407,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3478,19 +3423,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) pc = lookup_page_cgroup(page); - /* - * Fast unlocked return. Theoretically might have changed, have to - * check again after locking. - */ if (!PageCgroupUsed(pc)) return; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0; /* * We trust that only if there is a memcg associated with the page, it @@ -3531,7 +3468,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], @@ -3687,7 +3623,6 @@ int mem_cgroup_charge_anon(struct page *page, { unsigned int nr_pages = 1; struct mem_cgroup *memcg; - bool oom = true; if (mem_cgroup_disabled()) return 0; @@ -3699,14 +3634,9 @@ int mem_cgroup_charge_anon(struct page *page, if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, nr_pages, @@ -3743,7 +3673,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); + ret = mem_cgroup_try_charge(memcg, mask, 1); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -3770,7 +3700,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, if (!PageSwapCache(page)) { struct mem_cgroup *memcg; - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; *memcgp = memcg; @@ -3839,7 +3769,7 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, return 0; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, 1, type, false); @@ -3993,7 +3923,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, * replacement page, so leave it alone when phasing out the * page that is unused after the migration. */ - if (!end_migration && !mem_cgroup_is_root(memcg)) + if (!end_migration) mem_cgroup_do_uncharge(memcg, nr_pages, ctype); return memcg; @@ -4126,8 +4056,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. This memcg can * be obsolete one. We avoid calling css_tryget_online(). */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -4817,78 +4746,24 @@ out: return retval; } - -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) -{ - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; -} - -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) -{ - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - - return val << PAGE_SHIFT; -} - static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; } #ifdef CONFIG_MEMCG_KMEM @@ -5350,7 +5225,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) if (!t) goto unlock; - usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* * current_threshold points to threshold just below or equal to usage. @@ -5446,15 +5324,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5534,18 +5412,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, int i, j, size; mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); if (!thresholds->primary) goto unlock; - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -6299,9 +6178,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -6435,55 +6314,39 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret; - if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + __mem_cgroup_cancel_charge(root_mem_cgroup, count); return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = mem_cgroup_try_charge(mc.to, + GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + __mem_cgroup_cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** @@ -6760,21 +6623,18 @@ static void __mem_cgroup_clear_mc(void) /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); - if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } |