summaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c431
1 files changed, 294 insertions, 137 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2efa8ea07ff..da53a252b25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
-static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+
+/* for remember boot option*/
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata = 0;
+#endif
+
#else
#define do_swap_account (0)
#endif
@@ -278,7 +285,7 @@ enum move_type {
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
- spinlock_t lock; /* for from, to, moving_task */
+ spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -593,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- bool charge)
+ bool file, int nr_pages)
{
- int val = (charge) ? 1 : -1;
-
preempt_disable();
- if (PageCgroupCache(pc))
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+ if (file)
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
else
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
- if (charge)
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
- else
+ else {
__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+ nr_pages = -nr_pages; /* for event */
+ }
+
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
preempt_enable();
}
@@ -808,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
* removed from global LRU.
*/
mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
- return;
}
void mem_cgroup_del_lru(struct page *page)
@@ -830,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
return;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
/* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+ if (!PageCgroupUsed(pc))
+ return;
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
+ if (mem_cgroup_is_root(pc->mem_cgroup))
return;
mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]);
@@ -851,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
return;
pc = lookup_page_cgroup(page);
VM_BUG_ON(PageCgroupAcctLRU(pc));
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return;
-
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
@@ -1024,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
return NULL;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return NULL;
-
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
mz = page_cgroup_zoneinfo(pc);
if (!mz)
return NULL;
@@ -1079,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
@@ -1113,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
return false;
}
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+ if (!res_counter_check_margin(&mem->res, bytes))
+ return false;
+ if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+ return false;
+ return true;
+}
+
static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
@@ -1304,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
u64 limit;
u64 memsw;
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
- total_swap_pages;
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ limit += total_swap_pages << PAGE_SHIFT;
+
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
@@ -1592,11 +1610,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
* possibility of race condition. If there is, we take a lock.
*/
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
+ unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;
@@ -1606,39 +1626,36 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
if (unlikely(!mem || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
- if (unlikely(mem_cgroup_stealed(mem))) {
+ if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
/* take a lock against to access pc->mem_cgroup */
- lock_page_cgroup(pc);
+ move_lock_page_cgroup(pc, &flags);
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
- this_cpu_add(mem->stat->count[idx], val);
-
switch (idx) {
- case MEM_CGROUP_STAT_FILE_MAPPED:
+ case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
}
+ this_cpu_add(mem->stat->count[idx], val);
+
out:
if (unlikely(need_unlock))
- unlock_page_cgroup(pc);
+ move_unlock_page_cgroup(pc, &flags);
rcu_read_unlock();
return;
}
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
- mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1834,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
if (likely(!ret))
return CHARGE_OK;
+ res_counter_uncharge(&mem->res, csize);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
- if (csize > PAGE_SIZE) /* change csize and retry */
+ /*
+ * csize can be either a huge page (HPAGE_SIZE), a batch of
+ * regular pages (CHARGE_SIZE), or a single regular page
+ * (PAGE_SIZE).
+ *
+ * Never reclaim on behalf of optional batching, retry with a
+ * single page instead.
+ */
+ if (csize == CHARGE_SIZE)
return CHARGE_RETRY;
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
+ gfp_mask, flags);
+ if (mem_cgroup_check_margin(mem_over_limit, csize))
+ return CHARGE_RETRY;
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
+ * Even though the limit is exceeded at this point, reclaim
+ * may have been able to free some pages. Retry the charge
+ * before killing the task.
+ *
+ * Only for regular pages, though: huge pages are rather
+ * unlikely to succeed so close to the limit, and we fall back
+ * to regular pages anyway in case of failure.
*/
- if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ if (csize == PAGE_SIZE && ret)
return CHARGE_RETRY;
/*
@@ -1879,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask,
+ struct mem_cgroup **memcg, bool oom,
+ int page_size)
{
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
- int csize = CHARGE_SIZE;
+ int csize = max(CHARGE_SIZE, (unsigned long) page_size);
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1909,7 +1940,7 @@ again:
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
- if (consume_stock(mem))
+ if (page_size == PAGE_SIZE && consume_stock(mem))
goto done;
css_get(&mem->css);
} else {
@@ -1917,23 +1948,22 @@ again:
rcu_read_lock();
p = rcu_dereference(mm->owner);
- VM_BUG_ON(!p);
/*
- * because we don't have task_lock(), "p" can exit while
- * we're here. In that case, "mem" can point to root
- * cgroup but never be NULL. (and task_struct itself is freed
- * by RCU, cgroup itself is RCU safe.) Then, we have small
- * risk here to get wrong cgroup. But such kind of mis-account
- * by race always happens because we don't have cgroup_mutex().
- * It's overkill and we allow that small race, here.
+ * Because we don't have task_lock(), "p" can exit.
+ * In that case, "mem" can point to root or p can be NULL with
+ * race with swapoff. Then, we have small risk of mis-accouning.
+ * But such kind of mis-account by race always happens because
+ * we don't have cgroup_mutex(). It's overkill and we allo that
+ * small race, here.
+ * (*) swapoff at el will charge against mm-struct not against
+ * task-struct. So, mm->owner can be NULL.
*/
mem = mem_cgroup_from_task(p);
- VM_BUG_ON(!mem);
- if (mem_cgroup_is_root(mem)) {
+ if (!mem || mem_cgroup_is_root(mem)) {
rcu_read_unlock();
goto done;
}
- if (consume_stock(mem)) {
+ if (page_size == PAGE_SIZE && consume_stock(mem)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
@@ -1974,7 +2004,7 @@ again:
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
- csize = PAGE_SIZE;
+ csize = page_size;
css_put(&mem->css);
mem = NULL;
goto again;
@@ -1995,8 +2025,8 @@ again:
}
} while (ret != CHARGE_OK);
- if (csize > PAGE_SIZE)
- refill_stock(mem, csize - PAGE_SIZE);
+ if (csize > page_size)
+ refill_stock(mem, csize - page_size);
css_put(&mem->css);
done:
*memcg = mem;
@@ -2024,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
}
}
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+ int page_size)
{
- __mem_cgroup_cancel_charge(mem, 1);
+ __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
}
/*
@@ -2076,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
return mem;
}
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- enum charge_type ctype)
+ struct page_cgroup *pc,
+ enum charge_type ctype,
+ int page_size)
{
+ int nr_pages = page_size >> PAGE_SHIFT;
+
/* try_charge() can return NULL to *memcg, taking care of it. */
if (!mem)
return;
@@ -2092,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, page_size);
return;
}
-
+ /*
+ * we don't need page_cgroup_lock about tail pages, becase they are not
+ * accessed by any other context at this point.
+ */
pc->mem_cgroup = mem;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2119,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
break;
}
- mem_cgroup_charge_statistics(mem, pc, true);
-
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
unlock_page_cgroup(pc);
/*
* "charge_statistics" updated event counter. Then, check it.
@@ -2130,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
memcg_check_events(mem, pc->page);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+ (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+ struct page_cgroup *head_pc = lookup_page_cgroup(head);
+ struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+ unsigned long flags;
+
+ if (mem_cgroup_disabled())
+ return;
+ /*
+ * We have no races with charge/uncharge but will have races with
+ * page state accounting.
+ */
+ move_lock_page_cgroup(head_pc, &flags);
+
+ tail_pc->mem_cgroup = head_pc->mem_cgroup;
+ smp_wmb(); /* see __commit_charge() */
+ if (PageCgroupAcctLRU(head_pc)) {
+ enum lru_list lru;
+ struct mem_cgroup_per_zone *mz;
+
+ /*
+ * LRU flags cannot be copied because we need to add tail
+ *.page to LRU by generic call and our hook will be called.
+ * We hold lru_lock, then, reduce counter directly.
+ */
+ lru = page_lru(head);
+ mz = page_cgroup_zoneinfo(head_pc);
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ }
+ tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ move_unlock_page_cgroup(head_pc, &flags);
+}
+#endif
+
/**
* __mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
@@ -2148,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
*/
static void __mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+ struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+ int charge_size)
{
+ int nr_pages = charge_size >> PAGE_SHIFT;
+
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!page_is_cgroup_locked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);
@@ -2163,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
- mem_cgroup_charge_statistics(from, pc, false);
+ mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
- mem_cgroup_cancel_charge(from);
+ mem_cgroup_cancel_charge(from, charge_size);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, pc, true);
+ mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
@@ -2185,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
* __mem_cgroup_move_account()
*/
static int mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+ struct mem_cgroup *from, struct mem_cgroup *to,
+ bool uncharge, int charge_size)
{
int ret = -EINVAL;
+ unsigned long flags;
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+ return -EBUSY;
+
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
- __mem_cgroup_move_account(pc, from, to, uncharge);
+ move_lock_page_cgroup(pc, &flags);
+ __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
+ move_unlock_page_cgroup(pc, &flags);
ret = 0;
}
unlock_page_cgroup(pc);
@@ -2214,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
+ int page_size = PAGE_SIZE;
+ unsigned long flags;
int ret;
/* Is ROOT ? */
@@ -2226,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (isolate_lru_page(page))
goto put;
+ if (PageTransHuge(page))
+ page_size = HPAGE_SIZE;
+
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+ &parent, false, page_size);
if (ret || !parent)
goto put_back;
- ret = mem_cgroup_move_account(pc, child, parent, true);
+ if (page_size > PAGE_SIZE)
+ flags = compound_lock_irqsave(page);
+
+ ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
if (ret)
- mem_cgroup_cancel_charge(parent);
+ mem_cgroup_cancel_charge(parent, page_size);
+
+ if (page_size > PAGE_SIZE)
+ compound_unlock_irqrestore(page, flags);
put_back:
putback_lru_page(page);
put:
@@ -2252,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
{
struct mem_cgroup *mem = NULL;
+ int page_size = PAGE_SIZE;
struct page_cgroup *pc;
+ bool oom = true;
int ret;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ /*
+ * Never OOM-kill a process for a huge page. The
+ * fault handler will fall back to regular pages.
+ */
+ oom = false;
+ }
+
pc = lookup_page_cgroup(page);
/* can happen at boot */
if (unlikely(!pc))
return 0;
prefetchw(pc);
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
return 0;
}
@@ -2274,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- if (PageCompound(page))
- return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
@@ -2381,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
}
static void
@@ -2403,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype);
+ __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
@@ -2452,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, PAGE_SIZE);
}
static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+ int page_size)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
@@ -2483,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
+ if (page_size != PAGE_SIZE)
+ goto direct_uncharge;
+
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
@@ -2496,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
batch->memsw_bytes += PAGE_SIZE;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, page_size);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, page_size);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
return;
@@ -2510,8 +2625,10 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
+ int count;
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ int page_size = PAGE_SIZE;
if (mem_cgroup_disabled())
return NULL;
@@ -2519,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (PageSwapCache(page))
return NULL;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
+
+ count = page_size >> PAGE_SHIFT;
/*
* Check if our page_cgroup is valid
*/
@@ -2551,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(mem, pc, false);
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
ClearPageCgroupUsed(pc);
/*
@@ -2572,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mem_cgroup_get(mem);
}
if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
+ __do_uncharge(mem, ctype, page_size);
return mem;
@@ -2767,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page,
enum charge_type ctype;
int ret = 0;
+ VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
@@ -2816,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page,
return 0;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
@@ -2843,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
@@ -2858,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
return;
/* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
- /* at migration success, oldpage->mapping is NULL. */
- if (oldpage->mapping) {
+ if (!migration_ok) {
used = oldpage;
unused = newpage;
} else {
@@ -4169,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
@@ -4199,14 +4320,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kmalloc(size, GFP_KERNEL);
+ mem = kzalloc(size, GFP_KERNEL);
else
- mem = vmalloc(size);
+ mem = vzalloc(size);
if (!mem)
return NULL;
- memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat)
goto out_free;
@@ -4454,7 +4574,8 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+ PAGE_SIZE);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
@@ -4616,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4653,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
- return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
}
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
@@ -4691,18 +4818,28 @@ static void mem_cgroup_clear_mc(void)
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
-
mc.moved_swap = 0;
}
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
- mc.moving_task = NULL;
spin_unlock(&mc.lock);
mem_cgroup_end_move(from);
- memcg_oom_recover(from);
- memcg_oom_recover(to);
- wake_up_all(&mc.waitq);
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4729,16 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- VM_BUG_ON(mc.moving_task);
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
- mc.precharge = 0;
- mc.moved_charge = 0;
- mc.moved_swap = 0;
- mc.moving_task = current;
spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
@@ -4767,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
spinlock_t *ptl;
retry:
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
@@ -4787,7 +4921,7 @@ retry:
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(pc,
- mc.from, mc.to, false)) {
+ mc.from, mc.to, false, PAGE_SIZE)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -4832,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma;
lru_add_drain_all();
- down_read(&mm->mmap_sem);
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+ * Someone who are holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4911,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = {
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static int __init enable_swap_account(char *s)
+{
+ /* consider enabled if no parameter or 1 is given */
+ if (!(*s) || !strcmp(s, "=1"))
+ really_do_swap_account = 1;
+ else if (!strcmp(s, "=0"))
+ really_do_swap_account = 0;
+ return 1;
+}
+__setup("swapaccount", enable_swap_account);
static int __init disable_swap_account(char *s)
{
- really_do_swap_account = 0;
+ printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+ enable_swap_account("=0");
return 1;
}
__setup("noswapaccount", disable_swap_account);