diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 1483 |
1 files changed, 1304 insertions, 179 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b3..09255ec8159 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -59,6 +63,8 @@ #include <trace/events/vmscan.h> struct cgroup_subsys mem_cgroup_subsys __read_mostly; +EXPORT_SYMBOL(mem_cgroup_subsys); + #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -266,6 +272,10 @@ struct mem_cgroup { }; /* + * the counter to account for kernel memory usage. + */ + struct res_counter kmem; + /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. */ @@ -280,6 +290,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; @@ -330,8 +341,61 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif }; +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. 
*/ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ +}; + +/* We account when limit is on, but only after call sites are patched */ +#define KMEM_ACCOUNTED_MASK \ + ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) + +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_set_activated(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) +{ + clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a @@ -386,9 +450,13 @@ enum charge_type { }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) @@ -485,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 
+ * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. + */ +static DEFINE_IDA(kmem_limited_groups); +int memcg_limited_groups_array_size; + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * css_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE 65535 + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that does + * kmem_cache_alloc and the such to see this symbol as well + */ +struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) { + static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); +} + static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * @@ -800,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, int nid; u64 total = 0; - for_each_node_state(nid, N_HIGH_MEMORY) + for_each_node_state(nid, N_MEMORY) total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); return total; } @@ -1015,13 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; - if (!mm) - return; - rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) @@ -1040,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) out: rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_count_vm_event); +EXPORT_SYMBOL(__mem_cgroup_count_vm_event); /** * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg @@ -1454,6 +1588,10 
@@ done: res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); } /* @@ -1498,8 +1636,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order) +static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) { struct mem_cgroup *iter; unsigned long chosen_points = 0; @@ -1644,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) return; /* make a nodemask where this memcg uses memory from */ - memcg->scan_nodes = node_states[N_HIGH_MEMORY]; + memcg->scan_nodes = node_states[N_MEMORY]; - for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { + for_each_node_mask(nid, node_states[N_MEMORY]) { if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) node_clear(nid, memcg->scan_nodes); @@ -1717,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) /* * Check rest of nodes. */ - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { if (node_isset(nid, memcg->scan_nodes)) continue; if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) @@ -2061,20 +2199,28 @@ struct memcg_stock_pcp { static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. 
+ * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2251,7 +2397,8 @@ enum { }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2274,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. 
*/ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2298,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2370,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; - VM_BUG_ON(css_is_removed(&memcg->css)); if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(memcg)) + if (consume_stock(memcg, nr_pages)) goto done; css_get(&memcg->css); } else { @@ -2398,7 +2544,7 @@ again: rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(memcg)) { + if (consume_stock(memcg, nr_pages)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2433,7 +2579,8 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, + oom_check); switch (ret) { case CHARGE_OK: break; @@ -2510,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller must check css_is_removed() or some if - * it's concern. (dropping refcnt from swap can be called against removed - * memcg.) + * rcu_read_lock(). The caller is responsible for calling css_tryget if + * the mem_cgroup is used for charging. 
(dropping refcnt from swap can be + * called against removed memcg.) */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { @@ -2626,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +static DEFINE_MUTEX(set_limit_mutex); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +} + +/* + * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. + */ +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) +{ + struct kmem_cache *cachep; + + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; +} + +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, + struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct memcg_cache_params *params; + + if (!memcg_can_account_kmem(memcg)) + return -EIO; + + print_slabinfo_header(m); + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); + + return 0; +} +#endif + +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + struct mem_cgroup *_memcg; + int ret = 0; + bool may_oom; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; + + /* + * Conditions under which we can wait for the oom_killer. 
Those are + * the same conditions tested by the core page allocator + */ + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + + _memcg = memcg; + ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, + &_memcg, may_oom); + + if (ret == -EINTR) { + /* + * __mem_cgroup_try_charge() chosed to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. From a + * kmem/slab perspective, the cache has already been selected, + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * __mem_cgroup_try_charge() above. Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + + return ret; +} + +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +{ + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); + + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; + + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); +} + +void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) +{ + if (!memcg) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +/* + * helper for acessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. 
This function + * will return -1 when this is not a kmem-limited memcg. + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +/* + * This ends up being protected by the set_limit mutex, during normal + * operation, because that is its main call site. + * + * But when we create a new cache, we can call this as well if its parent + * is kmem-limited. That will have to hold set_limit_mutex as well. + */ +int memcg_update_cache_sizes(struct mem_cgroup *memcg) +{ + int num, ret; + + num = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (num < 0) + return num; + /* + * After this point, kmem_accounted (that we test atomically in + * the beginning of this conditional), is no longer 0. This + * guarantees only one process will set the following boolean + * to true. We don't need test_and_set because we're protected + * by the set_limit_mutex anyway. + */ + memcg_kmem_set_activated(memcg); + + ret = memcg_update_all_caches(num+1); + if (ret) { + ida_simple_remove(&kmem_limited_groups, num); + memcg_kmem_clear_activated(memcg); + return ret; + } + + memcg->kmemcg_id = num; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + return 0; +} + +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; +} + +/* + * We should update the current array size iff all caches updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this. 
+ */ +void memcg_update_array_size(int num) +{ + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) { + s->memcg_params = cur_params; + return -ENOMEM; + } + + s->memcg_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. + * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + s->memcg_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. 
+ */ + kfree(cur_params); + } + return 0; +} + +int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + size_t size = sizeof(struct memcg_cache_params); + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) + size += memcg_limited_groups_array_size * sizeof(void *); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } + return 0; +} + +void memcg_release_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + /* + * This happens, for instance, when a root cache goes away before we + * add any memcg. + */ + if (!s->memcg_params) + return; + + if (s->memcg_params->is_root_cache) + goto out; + + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + root = s->memcg_params->root_cache; + root->memcg_params->memcg_caches[id] = NULL; + mem_cgroup_put(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + +out: + kfree(s->memcg_params); +} + +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. 
+ * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. + * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if memcg_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only we would be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. + * + * So if we aren't down to zero, we'll just schedule a worker and try + * again + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + return; + } else + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. 
If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. + */ + schedule_work(&cachep->memcg_params->destroy); +} + +static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) +{ + char *name; + struct dentry *dentry; + + rcu_read_lock(); + dentry = rcu_dereference(memcg->css.cgroup->dentry); + rcu_read_unlock(); + + BUG_ON(dentry == NULL); + + name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), dentry->d_name.name); + + return name; +} + +static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, + struct kmem_cache *s) +{ + char *name; + struct kmem_cache *new; + + name = memcg_cache_name(memcg, s); + if (!name) + return NULL; + + new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + (s->flags & ~SLAB_PANIC), s->ctor, s); + + if (new) + new->allocflags |= __GFP_KMEMCG; + + kfree(name); + return new; +} + +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. 
Only one of them can create the cache. + */ +static DEFINE_MUTEX(memcg_cache_mutex); +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct kmem_cache *new_cachep; + int idx; + + BUG_ON(!memcg_can_account_kmem(memcg)); + + idx = memcg_cache_id(memcg); + + mutex_lock(&memcg_cache_mutex); + new_cachep = cachep->memcg_params->memcg_caches[idx]; + if (new_cachep) + goto out; + + new_cachep = kmem_cache_dup(memcg, cachep); + if (new_cachep == NULL) { + new_cachep = cachep; + goto out; + } + + mem_cgroup_get(memcg); + atomic_set(&new_cachep->memcg_params->nr_pages , 0); + + cachep->memcg_params->memcg_caches[idx] = new_cachep; + /* + * the readers won't lock, make sure everybody sees the updated value, + * so they won't put stuff in the queue again for no reason + */ + wmb(); +out: + mutex_unlock(&memcg_cache_mutex); + return new_cachep; +} + +void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i; + + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + return; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should caught this ill-case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the set_limit_mutex to protect ourselves against this. + */ + mutex_lock(&set_limit_mutex); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. 
+ * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still have active pages until this very moment. + * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control. + */ + c->memcg_params->dead = false; + cancel_work_sync(&c->memcg_params->destroy); + kmem_cache_destroy(c); + } + mutex_unlock(&set_limit_mutex); +} + +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!memcg_kmem_is_active(memcg)) + return; + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + cachep = memcg_params_to_cache(params); + cachep->memcg_params->dead = true; + INIT_WORK(&cachep->memcg_params->destroy, + kmem_cache_destroy_work_func); + schedule_work(&cachep->memcg_params->destroy); + } + mutex_unlock(&memcg->slab_caches_mutex); +} + +static void memcg_create_cache_work_func(struct work_struct *w) +{ + struct create_work *cw; + + cw = container_of(w, struct create_work, work); + memcg_create_kmem_cache(cw->memcg, cw->cachep); + /* Drop the reference gotten when we enqueued. */ + css_put(&cw->memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + * Called with rcu_read_lock. + */ +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct create_work *cw; + + cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + if (cw == NULL) + return; + + /* The corresponding put will be done in the workqueue. 
*/ + if (!css_tryget(&memcg->css)) { + kfree(cw); + return; + } + + cw->memcg = memcg; + cw->cachep = cachep; + + INIT_WORK(&cw->work, memcg_create_cache_work_func); + schedule_work(&cw->work); +} + +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. 
+ */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, + gfp_t gfp) +{ + struct mem_cgroup *memcg; + int idx; + + VM_BUG_ON(!cachep->memcg_params); + VM_BUG_ON(!cachep->memcg_params->is_root_cache); + + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); + rcu_read_unlock(); + + if (!memcg_can_account_kmem(memcg)) + return cachep; + + idx = memcg_cache_id(memcg); + + /* + * barrier to mare sure we're always seeing the up to date value. The + * code updating memcg_caches will issue a write barrier to match this. + */ + read_barrier_depends(); + if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; + } + + return cachep->memcg_params->memcg_caches[idx]; +} +EXPORT_SYMBOL(__memcg_kmem_get_cache); + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. 
+ * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + memcg = try_get_mem_cgroup_from_mm(current->mm); + + /* + * very rare case described in mem_cgroup_from_task. Unfortunately there + * isn't much we can do without complicating this too much, and it would + * be gfp-dependent anyway. Just let it go + */ + if (unlikely(!memcg)) + return true; + + if (!memcg_can_account_kmem(memcg)) { + css_put(&memcg->css); + return true; + } + + ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + if (!ret) + *_memcg = memcg; + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + struct page_cgroup *pc; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + return; + } + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + pc->mem_cgroup = memcg; + SetPageCgroupUsed(pc); + unlock_page_cgroup(pc); +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = NULL; + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + /* + * Fast unlocked return. Theoretically might have changed, have to + * check again after locking. 
+ */ + if (!PageCgroupUsed(pc)) + return; + + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + ClearPageCgroupUsed(pc); + } + unlock_page_cgroup(pc); + + /* + * We trust that only if there is a memcg associated with the page, it + * is a valid allocation + */ + if (!memcg) + return; + + VM_BUG_ON(mem_cgroup_is_root(memcg)); + memcg_uncharge_kmem(memcg, PAGE_SIZE << order); +} +#else +static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) @@ -2709,13 +3616,6 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, anon, nr_pages); - /* - * We charges against "to" which may not have any tasks. Then, "to" - * can be under rmdir(). But in current implementation, caller of - * this function is just force_empty() and move charge, so it's - * guaranteed that "to" is never removed. So, we don't check rmdir - * status here. - */ move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: @@ -2729,10 +3629,27 @@ out: return ret; } -/* - * move charges to its parent. +/** + * mem_cgroup_move_parent - moves page to the parent group + * @page: the page to move + * @pc: page_cgroup of the page + * @child: page's cgroup + * + * move charges to its parent or the root cgroup if the group has no + * parent (aka use_hierarchy==0). + * Although this might fail (get_page_unless_zero, isolate_lru_page or + * mem_cgroup_move_account fails) the failure is always temporary and + * it signals a race with a page removal/uncharge or migration. In the + * first case the page is on the way out and it will vanish from the LRU + * on the next attempt and the call should be retried later. 
+ * Isolation from the LRU fails only if page has been isolated from + * the LRU since we looked at it and that usually means either global + * reclaim or migration going on. The page will either get back to the + * LRU or vanish. + * Finally mem_cgroup_move_account fails only if the page got uncharged + * (!PageCgroupUsed) or moved to a different group. The page will + * disappear in the next attempt. */ - static int mem_cgroup_move_parent(struct page *page, struct page_cgroup *pc, struct mem_cgroup *child) @@ -2742,9 +3659,7 @@ static int mem_cgroup_move_parent(struct page *page, unsigned long uninitialized_var(flags); int ret; - /* Is ROOT ? */ - if (mem_cgroup_is_root(child)) - return -EINVAL; + VM_BUG_ON(mem_cgroup_is_root(child)); ret = -EBUSY; if (!get_page_unless_zero(page)) @@ -2761,8 +3676,10 @@ static int mem_cgroup_move_parent(struct page *page, if (!parent) parent = root_mem_cgroup; - if (nr_pages > 1) + if (nr_pages > 1) { + VM_BUG_ON(!PageTransHuge(page)); flags = compound_lock_irqsave(page); + } ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent); @@ -2904,7 +3821,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, return; if (!memcg) return; - cgroup_exclude_rmdir(&memcg->css); __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); /* @@ -2918,12 +3834,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, swp_entry_t ent = {.val = page_private(page)}; mem_cgroup_uncharge_swap(ent); } - /* - * At swapin, we may charge account against cgroup which has no tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here.
- */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } void mem_cgroup_commit_charge_swapin(struct page *page, @@ -3288,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; - VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; + if (PageTransHuge(page)) + nr_pages <<= compound_order(page); + pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3358,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. */ - __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); } /* remove redundant charge if migration failed*/ @@ -3371,8 +4284,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, if (!memcg) return; - /* blocks rmdir() */ - cgroup_exclude_rmdir(&memcg->css); + if (!migration_ok) { used = oldpage; unused = newpage; @@ -3406,13 +4318,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, */ if (anon) mem_cgroup_uncharge_page(used); - /* - * At migration, we may charge account against cgroup which has no - * tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. 
- */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } /* @@ -3490,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { @@ -3712,17 +4615,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return nr_reclaimed; } -/* +/** + * mem_cgroup_force_empty_list - clears LRU of a group + * @memcg: group to clear + * @node: NUMA node + * @zid: zone id + * @lru: lru to clear + * * Traverse a specified page_cgroup list and try to drop them all. This doesn't - * reclaim the pages page themselves - it just removes the page_cgroups. - * Returns true if some page_cgroups were not freed, indicating that the caller - * must retry this operation. + * reclaim the pages themselves - pages are moved to the parent (or root) + * group. */ -static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, +static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { struct lruvec *lruvec; - unsigned long flags, loop; + unsigned long flags; struct list_head *list; struct page *busy; struct zone *zone; @@ -3731,11 +4639,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, lruvec = mem_cgroup_zone_lruvec(zone, memcg); list = &lruvec->lists[lru]; - loop = mem_cgroup_get_lru_size(lruvec, lru); - /* give some margin against EBUSY etc...*/ - loop += 256; busy = NULL; - while (loop--) { + do { struct page_cgroup *pc; struct page *page; @@ -3761,76 +4666,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, cond_resched(); } else busy = NULL; - } - return !list_empty(list); + } while (!list_empty(list)); } /* - * make mem_cgroup's charge to be 0 if there is no task. + * make mem_cgroup's charge to be 0 if there is no task by moving + * all the charges and pages to the parent. * This enables deleting this mem_cgroup.
+ * + * Caller is responsible for holding css reference on the memcg. */ -static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) +static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { - int ret; - int node, zid, shrink; - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct cgroup *cgrp = memcg->css.cgroup; - - css_get(&memcg->css); + int node, zid; + u64 usage; - shrink = 0; - /* should free all ? */ - if (free_all) - goto try_to_free; -move_account: do { - ret = -EBUSY; - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) - goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); drain_all_stock_sync(memcg); - ret = 0; mem_cgroup_start_move(memcg); - for_each_node_state(node, N_HIGH_MEMORY) { - for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { + for_each_node_state(node, N_MEMORY) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { enum lru_list lru; for_each_lru(lru) { - ret = mem_cgroup_force_empty_list(memcg, + mem_cgroup_force_empty_list(memcg, node, zid, lru); - if (ret) - break; } } - if (ret) - break; } mem_cgroup_end_move(memcg); memcg_oom_recover(memcg); cond_resched(); - /* "ret" should also be checked to ensure all lists are empty. */ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); -out: - css_put(&memcg->css); - return ret; -try_to_free: + /* + * Kernel memory may not necessarily be trackable to a specific + * process. So they are not migrated, and therefore we can't + * expect their value to drop to 0 here. + * Having res filled up with kmem only is enough. + * + * This is a safety check because mem_cgroup_force_empty_list + * could have raced with mem_cgroup_replace_page_cache callers + * so the lru seemed empty but the page could have been added + * right after the check. RES_USAGE should be safe as we always + * charge before adding to the LRU. 
+ */ + usage = res_counter_read_u64(&memcg->res, RES_USAGE) - + res_counter_read_u64(&memcg->kmem, RES_USAGE); + } while (usage > 0); +} + +/* + * Reclaims as many pages from the given memcg as possible and moves + * the rest to the parent. + * + * Caller is responsible for holding css reference for memcg. + */ +static int mem_cgroup_force_empty(struct mem_cgroup *memcg) +{ + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct cgroup *cgrp = memcg->css.cgroup; + /* returns EBUSY if there is a task or if we come here twice. */ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { - ret = -EBUSY; - goto out; - } + if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) + return -EBUSY; + /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); /* try to free all pages in this cgroup */ - shrink = 1; while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { int progress; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + if (signal_pending(current)) + return -EINTR; + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, false); if (!progress) { @@ -3841,13 +4750,23 @@ try_to_free: } lru_add_drain(); - /* try move_account...there may be some *locked* pages. 
*/ - goto move_account; + mem_cgroup_reparent_charges(memcg); + + return 0; } static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) { - return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + int ret; + + if (mem_cgroup_is_root(memcg)) + return -EINVAL; + css_get(&memcg->css); + ret = mem_cgroup_force_empty(memcg); + css_put(&memcg->css); + + return ret; } @@ -3938,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); char str[64]; u64 val; - int type, name, len; + int name, len; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); @@ -3959,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } @@ -3966,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); return simple_read_from_buffer(buf, nbytes, ppos, str, len); } + +static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +{ + int ret = -EINVAL; +#ifdef CONFIG_MEMCG_KMEM + bool must_inc_static_branch = false; + + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. 
+ * + * Taking the cgroup_lock is really offensive, but it is so far the only + * way to guarantee that no children will appear. There are plenty of + * other offenders, and they should all go away. Fine grained locking + * is probably the way to go here. When we are fully hierarchical, we + * can also get rid of the use_hierarchy check. + */ + cgroup_lock(); + mutex_lock(&set_limit_mutex); + if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (cgroup_task_count(cont) || (memcg->use_hierarchy && + !list_empty(&cont->children))) { + ret = -EBUSY; + goto out; + } + ret = res_counter_set_limit(&memcg->kmem, val); + VM_BUG_ON(ret); + + ret = memcg_update_cache_sizes(memcg); + if (ret) { + res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + goto out; + } + must_inc_static_branch = true; + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes, so it is unfeasible to migrate them away. We + * need to reference count the memcg because of that. + */ + mem_cgroup_get(memcg); + } else + ret = res_counter_set_limit(&memcg->kmem, val); +out: + mutex_unlock(&set_limit_mutex); + cgroup_unlock(); + + /* + * We are by now familiar with the fact that we can't inc the static + * branch inside cgroup_lock. See disarm functions for details. A + * worker here is overkill, but also wrong: After the limit is set, we + * must start accounting right away. Since this operation can't fail, + * we can safely defer it to here - no rollback will be needed. 
+ * + * The boolean used to control this is also safe, because + * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be + * able to set it to true; + */ + if (must_inc_static_branch) { + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * setting the active bit after the inc will guarantee no one + * starts accounting before all call sites are patched + */ + memcg_kmem_set_active(memcg); + } + +#endif + return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + int ret = 0; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (!parent) + goto out; + + memcg->kmem_account_flags = parent->kmem_account_flags; +#ifdef CONFIG_MEMCG_KMEM + /* + * When that happens, we need to disable the static branch only on those + * memcgs that enabled it. To achieve this, we would be forced to + * complicate the code by keeping track of which memcgs were the ones + * that actually enabled limits, and which ones got it from its + * parents. + * + * It is a lot simpler just to do static_key_slow_inc() on every child + * that is accounted. + */ + if (!memcg_kmem_is_active(memcg)) + goto out; + + /* + * destroy(), called if we fail, will issue static_key_slow_inc() and + * mem_cgroup_put() if kmem is enabled. We have to either call them + * unconditionally, or clear the KMEM_ACTIVE flag. I personally find + * this more consistent, since it always leads to the same destroy path + */ + mem_cgroup_get(memcg); + static_key_slow_inc(&memcg_kmem_enabled_key); + + mutex_lock(&set_limit_mutex); + ret = memcg_update_cache_sizes(memcg); + mutex_unlock(&set_limit_mutex); +#endif +out: + return ret; +} + /* * The user of this function is... * RES_LIMIT.
@@ -3974,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + enum res_type type; + int name; unsigned long long val; int ret; @@ -3996,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(cont, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -4050,7 +5097,8 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); @@ -4062,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) case RES_MAX_USAGE: if (type == _MEM) res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } @@ -4120,7 +5176,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); seq_printf(m, "total=%lu", total_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); seq_printf(m, " N%d=%lu", nid, node_nr); } @@ -4128,7 +5184,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype 
*cft, file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); seq_printf(m, "file=%lu", file_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE); seq_printf(m, " N%d=%lu", nid, node_nr); @@ -4137,7 +5193,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); seq_printf(m, "anon=%lu", anon_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON); seq_printf(m, " N%d=%lu", nid, node_nr); @@ -4146,7 +5202,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); seq_printf(m, "unevictable=%lu", unevictable_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, BIT(LRU_UNEVICTABLE)); seq_printf(m, " N%d=%lu", nid, node_nr); @@ -4386,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4469,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4547,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = MEMFILE_TYPE(cft->private); + 
enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); @@ -4572,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); BUG_ON(type != _OOM_TYPE); @@ -4631,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, #ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + int ret; + + memcg->kmemcg_id = -1; + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + return mem_cgroup_sockets_init(memcg, ss); }; static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); + + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + /* + * Charges already down to 0, undo mem_cgroup_get() done in the charge + * path here, being careful not to race with memcg_uncharge_kmem: it is + * possible that the charges went down to 0 between mark_dead and the + * res_counter read, so in that case, we don't need the put + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + mem_cgroup_put(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -4745,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = { .read = mem_cgroup_read, }, #endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read = mem_cgroup_read, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read = mem_cgroup_read, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, + { + .name = "kmem.max_usage_in_bytes", + 
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read = mem_cgroup_read, + }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .read_seq_string = mem_cgroup_slabinfo_read, + }, +#endif +#endif { }, /* terminate */ }; @@ -4812,16 +5920,29 @@ out_free: } /* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) + * + * Instead of clearing all references at force_empty, we remember + * the number of reference from swap_cgroup and free mem_cgroup when + * it goes down to 0. + * + * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void free_work(struct work_struct *work) + +static void __mem_cgroup_free(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + int node; int size = sizeof(struct mem_cgroup); - memcg = container_of(work, struct mem_cgroup, work_freeing); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); + + for_each_node(node) + free_mem_cgroup_per_zone_info(memcg, node); + + free_percpu(memcg->stat); + /* * We need to make sure that (at least for now), the jump label * destruction code runs outside of the cgroup lock. This is because @@ -4833,45 +5954,34 @@ static void free_work(struct work_struct *work) * to move this code around, and make sure it is outside * the cgroup_lock. 
*/ - disarm_sock_keys(memcg); + disarm_static_keys(memcg); if (size < PAGE_SIZE) kfree(memcg); else vfree(memcg); } -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} /* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void free_work(struct work_struct *work) { - int node; + struct mem_cgroup *memcg; - mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); + memcg = container_of(work, struct mem_cgroup, work_freeing); + __mem_cgroup_free(memcg); +} - for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); +static void free_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; - free_percpu(memcg->stat); - call_rcu(&memcg->rcu_freeing, free_rcu); + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, free_work); + schedule_work(&memcg->work_freeing); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4883,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { if (atomic_sub_and_test(count, &memcg->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); if (parent) mem_cgroup_put(parent); } @@ -4953,7 +6063,7 
@@ err_cleanup: } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup *cont) +mem_cgroup_css_alloc(struct cgroup *cont) { struct mem_cgroup *memcg, *parent; long error = -ENOMEM; @@ -4980,7 +6090,6 @@ mem_cgroup_create(struct cgroup *cont) &per_cpu(memcg_stock, cpu); INIT_WORK(&stock->work, drain_local_stock); } - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); memcg->use_hierarchy = parent->use_hierarchy; @@ -4990,6 +6099,8 @@ mem_cgroup_create(struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&memcg->res, &parent->res); res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); + /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -5000,6 +6111,7 @@ mem_cgroup_create(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5034,14 +6146,15 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup *cont) +static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - return mem_cgroup_force_empty(memcg, false); + mem_cgroup_reparent_charges(memcg); + mem_cgroup_destroy_all_caches(memcg); } -static void mem_cgroup_destroy(struct cgroup *cont) +static void mem_cgroup_css_free(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); @@ -5631,18 +6744,30 @@ static void mem_cgroup_move_task(struct cgroup *cont, struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", .subsys_id = mem_cgroup_subsys_id, - .create = mem_cgroup_create, - .pre_destroy = mem_cgroup_pre_destroy, - .destroy = mem_cgroup_destroy, + .css_alloc = mem_cgroup_css_alloc, + .css_offline = 
mem_cgroup_css_offline, + .css_free = mem_cgroup_css_free, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .base_cftypes = mem_cgroup_files, .early_init = 0, .use_id = 1, - .__DEPRECATED_clear_css_refs = true, }; +/* + * The rest of init is performed during ->css_alloc() for root css which + * happens before initcalls. hotcpu_notifier() can't be done together as + * it would introduce circular locking by adding cgroup_lock -> cpu hotplug + * dependency. Do it from a subsys_initcall(). + */ +static int __init mem_cgroup_init(void) +{ + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); + return 0; +} +subsys_initcall(mem_cgroup_init); + #ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { |