Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 1936
1 file changed, 1238 insertions(+), 698 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce1302..5b6b0039f72 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,24 +39,28 @@
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
+#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
+#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/lockdep.h>
+#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
+#include "slab.h"
#include <asm/uaccess.h>
@@ -145,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
* matches memcg->dead_count of the hierarchy root group.
*/
struct mem_cgroup *last_visited;
- unsigned long last_dead_count;
+ int last_dead_count;
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -160,6 +164,10 @@ struct mem_cgroup_per_zone {
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+ struct rb_node tree_node; /* RB tree node */
+ unsigned long long usage_in_excess; /* Set to the value by which */
+ /* the soft limit is exceeded */
+ bool on_tree;
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
@@ -168,6 +176,26 @@ struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
+/*
+ * Cgroups above their limits are maintained in an RB-tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+ struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+ struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
u64 threshold;
@@ -200,6 +228,46 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
+/*
+ * cgroup_event represents events which userspace wants to receive.
+ */
+struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these is stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add a new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on the eventfd to send a notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or when the cgroup is removed. This callback must be
+ * set if you want to provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below are needed to unregister the event when
+ * userspace closes the eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_t wait;
+ struct work_struct remove;
+};
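
As a hedged illustration (not from this patch), a register_event()/notify pair
would typically take the shape sketched below; the example_* names are
hypothetical:

	/* Sketch: remember the eventfd at register time... */
	static int example_register_event(struct mem_cgroup *memcg,
					  struct eventfd_ctx *eventfd,
					  const char *args)
	{
		/* parse args, link eventfd into a list on the memcg */
		return 0;
	}

	/* ...and signal it when the watched state changes. */
	static void example_notify(struct eventfd_ctx *eventfd)
	{
		eventfd_signal(eventfd, 1);	/* wakes userspace waiters */
	}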
+
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
@@ -286,7 +354,7 @@ struct mem_cgroup {
atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct tcp_memcontrol tcp_mem;
+ struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
@@ -303,44 +371,21 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
- /*
- * Protects soft_contributed transitions.
- * See mem_cgroup_update_soft_limit
- */
- spinlock_t soft_lock;
- /*
- * If true then this group has increased parents' children_in_excess
- * when it got over the soft limit.
- * When a group falls bellow the soft limit, parents' children_in_excess
- * is decreased and soft_contributed changed to false.
- */
- bool soft_contributed;
-
- /* Number of children that are in soft limit excess */
- atomic_t children_in_excess;
+ /* List of events which userspace wants to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
-static size_t memcg_size(void)
-{
- return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
-}
-
/* internal only representation about the status of kmem accounting. */
enum {
- KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
- ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
@@ -352,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
- clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
/*
@@ -422,6 +457,7 @@ static bool move_file(void)
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -478,14 +514,32 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
- return &mem_cgroup_from_css(css)->vmpressure;
+ return (memcg == root_mem_cgroup);
}
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+/*
+ * We restrict the id to the range [1, 65535], so it can fit into
+ * an unsigned short.
+ */
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
- return (memcg == root_mem_cgroup);
+ /*
+ * The ID of the root cgroup is 0, but memcg treats 0 as an
+ * invalid ID, so we return (cgroup_id + 1).
+ */
+ return memcg->css.cgroup->id + 1;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id - 1, &mem_cgroup_subsys);
+ return mem_cgroup_from_css(css);
}
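
For illustration, a minimal sketch (not part of the patch) of the invariant
these two helpers maintain; the +1 offset keeps 0 free as a "no memcg"
marker, e.g. for the swap cgroup map:

	/* Sketch only: round-tripping a memcg through its unsigned short id. */
	unsigned short id = mem_cgroup_id(memcg);	/* cgroup id 0 -> memcg id 1 */
	BUG_ON(id == 0);				/* 0 stays reserved */
	BUG_ON(mem_cgroup_from_id(id) != memcg);	/* from_id(id(m)) == m */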
/* Writing them here to avoid exposing memcg's inner layout */
@@ -540,13 +594,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
- return &memcg->tcp_mem.cg_proto;
+ return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
- if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ if (!memcg_proto_activated(&memcg->tcp_mem))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
@@ -559,16 +613,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * There are two main reasons for not using the css_id for this:
- * 1) this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * 2) In order not to violate the cgroup API, we would like to do all memory
- * allocation in ->create(). At that point, we haven't yet allocated the
- * css_id. Having a separate index prevents us from messing with the cgroup
- * core for this
+ * The main reason for not using the cgroup id for this:
+ * it works better in sparse environments, where we have a lot of memcgs
+ * but only a few of them kmem-limited. If we had, for instance, 200
+ * memcgs, and none but the 200th were kmem-limited, we'd have to have a
+ * 200-entry array for that.
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
@@ -583,14 +632,14 @@ int memcg_limited_groups_array_size;
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
- * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
- * css_id space is not getting any smaller, and we don't have to necessarily
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE 65535
+#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
/*
* A lot of the calls to the cache allocation functions are expected to be
@@ -648,6 +697,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
return mem_cgroup_zoneinfo(memcg, nid, zid);
}
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+ return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
+
+ return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
+{
+ struct rb_node **p = &mctz->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct mem_cgroup_per_zone *mz_node;
+
+ if (mz->on_tree)
+ return;
+
+ mz->usage_in_excess = new_usage_in_excess;
+ if (!mz->usage_in_excess)
+ return;
+ while (*p) {
+ parent = *p;
+ mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+ tree_node);
+ if (mz->usage_in_excess < mz_node->usage_in_excess)
+ p = &(*p)->rb_left;
+ /*
+ * We can't avoid mem cgroups that are over their soft
+ * limit by the same amount
+ */
+ else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&mz->tree_node, parent, p);
+ rb_insert_color(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
+{
+ if (!mz->on_tree)
+ return;
+ rb_erase(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
+{
+ spin_lock(&mctz->lock);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+ unsigned long long excess;
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_tree_per_zone *mctz;
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
+ mctz = soft_limit_tree_from_page(page);
+
+ /*
+ * Necessary to update all ancestors when hierarchy is used,
+ * because their event counter is not touched.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ excess = res_counter_soft_limit_excess(&memcg->res);
+ /*
+ * We have to update the tree if mz is on RB-tree or
+ * mem is over its softlimit.
+ */
+ if (excess || mz->on_tree) {
+ spin_lock(&mctz->lock);
+ /* if on-tree, remove it */
+ if (mz->on_tree)
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ /*
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
+ */
+ __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ }
+ }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+ int node, zone;
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_tree_per_zone *mctz;
+
+ for_each_node(node) {
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ mz = mem_cgroup_zoneinfo(memcg, node, zone);
+ mctz = soft_limit_tree_node_zone(node, zone);
+ mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ }
+ }
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+ struct rb_node *rightmost = NULL;
+ struct mem_cgroup_per_zone *mz;
+
+retry:
+ mz = NULL;
+ rightmost = rb_last(&mctz->rb_root);
+ if (!rightmost)
+ goto done; /* Nothing to reclaim from */
+
+ mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+ /*
+ * Remove the node now but someone else can add it back;
+ * we will add it back at the end of reclaim to its correct
+ * position in the tree.
+ */
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+ !css_tryget(&mz->memcg->css))
+ goto retry;
+done:
+ return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ spin_lock(&mctz->lock);
+ mz = __mem_cgroup_largest_soft_limit_node(mctz);
+ spin_unlock(&mctz->lock);
+ return mz;
+}
+
/*
* Implementation Note: reading percpu statistics for memcg.
*
@@ -698,6 +905,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
+ get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
@@ -705,6 +913,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
val += memcg->nocpu_base.events[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
+ put_online_cpus();
return val;
}
@@ -822,48 +1031,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
}
/*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
- unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
- struct mem_cgroup *parent = memcg;
- int delta = 0;
-
- spin_lock(&memcg->soft_lock);
- if (excess) {
- if (!memcg->soft_contributed) {
- delta = 1;
- memcg->soft_contributed = true;
- }
- } else {
- if (memcg->soft_contributed) {
- delta = -1;
- memcg->soft_contributed = false;
- }
- }
-
- /*
- * Necessary to update all ancestors when hierarchy is used
- * because their event counter is not touched.
- * We track children even outside the hierarchy for the root
- * cgroup because tree walk starting at root should visit
- * all cgroups and we want to prevent from pointless tree
- * walk if no children is below the limit.
- */
- while (delta && (parent = parent_mem_cgroup(parent)))
- atomic_add(delta, &parent->children_in_excess);
- if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
- atomic_add(delta, &root_mem_cgroup->children_in_excess);
- spin_unlock(&memcg->soft_lock);
-}
-
-/*
* Check events in order.
*
*/
@@ -886,7 +1053,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
- mem_cgroup_update_soft_limit(memcg);
+ mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1096,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return memcg;
}
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
- mem_cgroup_iter_filter cond)
-{
- if (!cond)
- return VISIT;
- return cond(memcg, root);
-}
-
/*
* Returns a next (in a pre-order walk) alive memcg (with elevated css
* ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1103,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
* helper function to be used by mem_cgroup_iter
*/
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
- struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+ struct mem_cgroup *last_visited)
{
struct cgroup_subsys_state *prev_css, *next_css;
@@ -959,36 +1117,22 @@ skip_node:
* skipped and we should continue the tree walk.
* last_visited css is safe to use because it is
* protected by css_get and the tree walk is rcu safe.
+ *
+ * We do not take a reference on the root of the tree walk
+ * because we might race with the root removal when it would
+ * be the only node in the iterated hierarchy and mem_cgroup_iter
+ * would end up in an endless loop because it expects that at
+ * least one valid node will be returned. Root cannot disappear
+ * because the caller of the iterator should hold it already, so
+ * skipping the css reference should be safe.
*/
if (next_css) {
- struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
-
- switch (mem_cgroup_filter(mem, root, cond)) {
- case SKIP:
- prev_css = next_css;
- goto skip_node;
- case SKIP_TREE:
- if (mem == root)
- return NULL;
- /*
- * css_rightmost_descendant is not an optimal way to
- * skip through a subtree (especially for imbalanced
- * trees leaning to right) but that's what we have right
- * now. More effective solution would be traversing
- * right-up for first non-NULL without calling
- * css_next_descendant_pre afterwards.
- */
- prev_css = css_rightmost_descendant(next_css);
- goto skip_node;
- case VISIT:
- if (css_tryget(&mem->css))
- return mem;
- else {
- prev_css = next_css;
- goto skip_node;
- }
- break;
- }
+ if ((next_css == &root->css) ||
+ ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
+ return mem_cgroup_from_css(next_css);
+
+ prev_css = next_css;
+ goto skip_node;
}
return NULL;
@@ -1022,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
if (iter->last_dead_count == *sequence) {
smp_rmb();
position = iter->last_visited;
- if (position && !css_tryget(&position->css))
+
+ /*
+ * We cannot take a reference to root because we might race
+ * with root removal, and returning NULL would end up in
+ * an endless loop at the iterator user level because root
+ * would be returned all the time.
+ */
+ if (position && position != root &&
+ !css_tryget(&position->css))
position = NULL;
}
return position;
@@ -1031,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
struct mem_cgroup *last_visited,
struct mem_cgroup *new_position,
+ struct mem_cgroup *root,
int sequence)
{
- if (last_visited)
+ /* root reference counting symmetric to mem_cgroup_iter_load */
+ if (last_visited && last_visited != root)
css_put(&last_visited->css);
/*
* We store the sequence count from the time @last_visited was
@@ -1051,7 +1205,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
* @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
*
* Returns references to children of the hierarchy below @root, or
* @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1217,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
* divide up the memcgs in the hierarchy among all concurrent
* reclaimers operating on the same zone and priority.
*/
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
- struct mem_cgroup_reclaim_cookie *reclaim,
- mem_cgroup_iter_filter cond)
+ struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *last_visited = NULL;
- if (mem_cgroup_disabled()) {
- /* first call must return non-NULL, second return NULL */
- return (struct mem_cgroup *)(unsigned long)!prev;
- }
+ if (mem_cgroup_disabled())
+ return NULL;
if (!root)
root = root_mem_cgroup;
@@ -1086,9 +1236,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
goto out_css_put;
- if (mem_cgroup_filter(root, root, cond) == VISIT)
- return root;
- return NULL;
+ return root;
}
rcu_read_lock();
@@ -1111,10 +1259,11 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
last_visited = mem_cgroup_iter_load(iter, root, &seq);
}
- memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+ memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, seq);
+ mem_cgroup_iter_update(iter, last_visited, memcg, root,
+ seq);
if (!memcg)
iter->generation++;
@@ -1122,11 +1271,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
reclaim->generation = iter->generation;
}
- /*
- * We have finished the whole tree walk or no group has been
- * visited because filter told us to skip the root node.
- */
- if (!memcg && (prev || (cond && !last_visited)))
+ if (prev && !memcg)
goto out_unlock;
}
out_unlock:
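
For context, a sketch (not part of the patch) of the canonical way this
iterator is consumed; mem_cgroup_iter_break() drops the css reference when
the walk is abandoned early:

	struct mem_cgroup *iter;

	for (iter = mem_cgroup_iter(root, NULL, NULL);
	     iter;
	     iter = mem_cgroup_iter(root, iter, NULL)) {
		if (done_with(iter)) {	/* hypothetical predicate */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}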
@@ -1318,7 +1463,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
- return css_is_ancestor(&memcg->css, &root_memcg->css);
+ return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -1538,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct cgroup *task_cgrp;
- struct cgroup *mem_cgrp;
/*
- * Need a buffer in BSS, can't rely on allocations. The code relies
- * on the assumption that OOM is serialized for memory controller.
- * If this assumption is broken, revisit this code.
+ * protects memcg_name and makes sure that parallel ooms do not
+ * interleave
*/
+ static DEFINE_MUTEX(oom_info_lock);
+ struct cgroup *task_cgrp;
+ struct cgroup *mem_cgrp;
static char memcg_name[PATH_MAX];
int ret;
struct mem_cgroup *iter;
@@ -1553,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
if (!p)
return;
+ mutex_lock(&oom_info_lock);
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
@@ -1621,6 +1767,7 @@ done:
pr_cont("\n");
}
+ mutex_unlock(&oom_info_lock);
}
/*
@@ -1713,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
- if (points > chosen_points) {
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points &&
+ thread_group_leader(chosen))
+ continue;
+
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
}
css_task_iter_end(&it);
}
@@ -1767,7 +1919,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
return total;
}
-#if MAX_NUMNODES > 1
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
@@ -1790,6 +1941,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
return false;
}
+#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
@@ -1857,52 +2009,112 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
return node;
}
+/*
+ * Check all nodes whether they contain reclaimable pages or not.
+ * For a quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+ int nid;
+
+ /*
+ * quick check...making use of scan_nodes.
+ * We can skip unused nodes.
+ */
+ if (!nodes_empty(memcg->scan_nodes)) {
+ for (nid = first_node(memcg->scan_nodes);
+ nid < MAX_NUMNODES;
+ nid = next_node(nid, memcg->scan_nodes)) {
+
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+ return true;
+ }
+ }
+ /*
+ * Check rest of nodes.
+ */
+ for_each_node_state(nid, N_MEMORY) {
+ if (node_isset(nid, memcg->scan_nodes))
+ continue;
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+ return true;
+ }
+ return false;
+}
+
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- * a) it is over its soft limit
- * b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
- struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
- struct mem_cgroup *parent;
-
- if (!memcg)
- memcg = root_mem_cgroup;
- parent = memcg;
-
- if (res_counter_soft_limit_excess(&memcg->res))
- return VISIT;
+ return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
- /*
- * If any parent up to the root in the hierarchy is over its soft limit
- * then we have to obey and reclaim from this group as well.
- */
- while ((parent = parent_mem_cgroup(parent))) {
- if (res_counter_soft_limit_excess(&parent->res))
- return VISIT;
- if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+ struct zone *zone,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ struct mem_cgroup *victim = NULL;
+ int total = 0;
+ int loop = 0;
+ unsigned long excess;
+ unsigned long nr_scanned;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = 0,
+ };
+
+ excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+ while (1) {
+ victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+ if (!victim) {
+ loop++;
+ if (loop >= 2) {
+ /*
+ * If we have not been able to reclaim
+ * anything, it might be because there are
+ * no reclaimable pages under this hierarchy
+ */
+ if (!total)
+ break;
+ /*
+ * We want to do more targeted reclaim.
+ * excess >> 2 is not too excessive, so we don't
+ * reclaim too much, nor so little that we keep
+ * coming back to reclaim from this cgroup
+ */
+ if (total >= (excess >> 2) ||
+ (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+ break;
+ }
+ continue;
+ }
+ if (!mem_cgroup_reclaimable(victim, false))
+ continue;
+ total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+ zone, &nr_scanned);
+ *total_scanned += nr_scanned;
+ if (!res_counter_soft_limit_excess(&root_memcg->res))
break;
}
-
- if (!atomic_read(&memcg->children_in_excess))
- return SKIP_TREE;
- return SKIP;
+ mem_cgroup_iter_break(root_memcg, victim);
+ return total;
}
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+ .name = "memcg_oom_lock",
+};
+#endif
+
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
@@ -1940,7 +2152,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
}
iter->oom_lock = false;
}
- }
+ } else
+ mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
spin_unlock(&memcg_oom_lock);
@@ -1952,6 +2165,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
+ mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
@@ -2018,110 +2232,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
memcg_wakeup_oom(memcg);
}
-/*
- * try to call OOM killer
- */
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- bool locked;
- int wakeups;
-
if (!current->memcg_oom.may_oom)
return;
-
- current->memcg_oom.in_memcg_oom = 1;
-
/*
- * As with any blocking lock, a contender needs to start
- * listening for wakeups before attempting the trylock,
- * otherwise it can miss the wakeup from the unlock and sleep
- * indefinitely. This is just open-coded because our locking
- * is so particular to memcg hierarchies.
+ * We are in the middle of the charge context here, so we
+ * don't want to block when potentially sitting on a callstack
+ * that holds all kinds of filesystem and mm locks.
+ *
+ * Also, the caller may handle a failed allocation gracefully
+ * (like optional page cache readahead) and so an OOM killer
+ * invocation might not even be necessary.
+ *
+ * That's why we don't do anything here except remember the
+ * OOM context and then deal with it at the end of the page
+ * fault when the stack is unwound, the locks are released,
+ * and when we know whether the fault was overall successful.
*/
- wakeups = atomic_read(&memcg->oom_wakeups);
- mem_cgroup_mark_under_oom(memcg);
-
- locked = mem_cgroup_oom_trylock(memcg);
-
- if (locked)
- mem_cgroup_oom_notify(memcg);
-
- if (locked && !memcg->oom_kill_disable) {
- mem_cgroup_unmark_under_oom(memcg);
- mem_cgroup_out_of_memory(memcg, mask, order);
- mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
- */
- memcg_oom_recover(memcg);
- } else {
- /*
- * A system call can just return -ENOMEM, but if this
- * is a page fault and somebody else is handling the
- * OOM already, we need to sleep on the OOM waitqueue
- * for this memcg until the situation is resolved.
- * Which can take some time because it might be
- * handled by a userspace task.
- *
- * However, this is the charge context, which means
- * that we may sit on a large call stack and hold
- * various filesystem locks, the mmap_sem etc. and we
- * don't want the OOM handler to deadlock on them
- * while we sit here and wait. Store the current OOM
- * context in the task_struct, then return -ENOMEM.
- * At the end of the page fault handler, with the
- * stack unwound, pagefault_out_of_memory() will check
- * back with us by calling
- * mem_cgroup_oom_synchronize(), possibly putting the
- * task to sleep.
- */
- current->memcg_oom.oom_locked = locked;
- current->memcg_oom.wakeups = wakeups;
- css_get(&memcg->css);
- current->memcg_oom.wait_on_memcg = memcg;
- }
+ css_get(&memcg->css);
+ current->memcg_oom.memcg = memcg;
+ current->memcg_oom.gfp_mask = mask;
+ current->memcg_oom.order = order;
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
*
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
*
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
*
* Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
*/
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
{
+ struct mem_cgroup *memcg = current->memcg_oom.memcg;
struct oom_wait_info owait;
- struct mem_cgroup *memcg;
+ bool locked;
/* OOM is global, do not handle */
- if (!current->memcg_oom.in_memcg_oom)
- return false;
-
- /*
- * We invoked the OOM killer but there is a chance that a kill
- * did not free up any charges. Everybody else might already
- * be sleeping, so restart the fault and keep the rampage
- * going until some charges are released.
- */
- memcg = current->memcg_oom.wait_on_memcg;
if (!memcg)
- goto out;
+ return false;
- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
- goto out_memcg;
+ if (!handle)
+ goto cleanup;
owait.memcg = memcg;
owait.wait.flags = 0;
@@ -2130,13 +2293,25 @@ bool mem_cgroup_oom_synchronize(void)
INIT_LIST_HEAD(&owait.wait.task_list);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- /* Only sleep if we didn't miss any wakeups since OOM */
- if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+ current->memcg_oom.order);
+ } else {
schedule();
- finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
- mem_cgroup_unmark_under_oom(memcg);
- if (current->memcg_oom.oom_locked) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+
+ if (locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
@@ -2145,10 +2320,9 @@ out_memcg:
*/
memcg_oom_recover(memcg);
}
+cleanup:
+ current->memcg_oom.memcg = NULL;
css_put(&memcg->css);
- current->memcg_oom.wait_on_memcg = NULL;
-out:
- current->memcg_oom.in_memcg_oom = 0;
return true;
}
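
For context, the counterpart of this deferral sits at the end of the page
fault path; roughly (simplified sketch of the era's pagefault_out_of_memory()):

	void pagefault_out_of_memory(void)
	{
		/* A memcg OOM recorded during the charge is handled here,
		 * with the fault stack unwound and locks released. */
		if (mem_cgroup_oom_synchronize(true))
			return;
		/* ... otherwise fall through to the global OOM killer ... */
	}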
@@ -2562,6 +2736,12 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
|| fatal_signal_pending(current)))
goto bypass;
+ if (unlikely(task_in_memcg_oom(current)))
+ goto nomem;
+
+ if (gfp_mask & __GFP_NOFAIL)
+ oom = false;
+
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
@@ -2659,8 +2839,10 @@ done:
*ptr = memcg;
return 0;
nomem:
- *ptr = NULL;
- return -ENOMEM;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ *ptr = NULL;
+ return -ENOMEM;
+ }
bypass:
*ptr = root_mem_cgroup;
return -EINTR;
@@ -2709,15 +2891,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
- struct cgroup_subsys_state *css;
-
/* ID 0 is unused ID */
if (!id)
return NULL;
- css = css_lookup(&mem_cgroup_subsys, id);
- if (!css)
- return NULL;
- return mem_cgroup_from_css(css);
+ return mem_cgroup_from_id(id);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2727,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
swp_entry_t ent;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
@@ -2761,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
bool anon;
lock_page_cgroup(pc);
- VM_BUG_ON(PageCgroupUsed(pc));
+ VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, because they are not
* accessed by any other context at this point.
@@ -2796,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
@@ -2812,7 +2989,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
unlock_page_cgroup(pc);
/*
- * "charge_statistics" updated event counter.
+ * "charge_statistics" updated event counter. Then, check it.
+ * Insert the ancestor (and the ancestor's ancestors) into the
+ * softlimit RB-tree if they exceed their softlimit.
*/
memcg_check_events(memcg, page);
}
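
Many assertions in this patch are converted from VM_BUG_ON() to
VM_BUG_ON_PAGE(); assuming the <linux/mmdebug.h> definition of this era, the
difference is roughly that the offending page is dumped before BUG():

	#define VM_BUG_ON_PAGE(cond, page)				\
		do {							\
			if (unlikely(cond)) {				\
				dump_page(page);			\
				BUG();					\
			}						\
		} while (0)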
@@ -2820,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
+static DEFINE_MUTEX(activate_kmem_mutex);
+
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+ memcg_kmem_is_active(memcg);
}
/*
@@ -2836,14 +3017,13 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
- return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+ return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
}
#ifdef CONFIG_SLABINFO
-static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
@@ -2865,21 +3045,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
struct res_counter *fail_res;
struct mem_cgroup *_memcg;
int ret = 0;
- bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- /*
- * Conditions under which we can wait for the oom_killer. Those are
- * the same conditions tested by the core page allocator
- */
- may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
_memcg = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, may_oom);
+ &_memcg, oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
@@ -2930,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
css_put(&memcg->css);
}
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
- if (!memcg)
- return;
-
- mutex_lock(&memcg->slab_caches_mutex);
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
- mutex_unlock(&memcg->slab_caches_mutex);
-}
-
/*
* helper for accessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
@@ -2950,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
- int num, ret;
-
- num = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (num < 0)
- return num;
- /*
- * After this point, kmem_accounted (that we test atomically in
- * the beginning of this conditional), is no longer 0. This
- * guarantees only one process will set the following boolean
- * to true. We don't need test_and_set because we're protected
- * by the set_limit_mutex anyway.
- */
- memcg_kmem_set_activated(memcg);
-
- ret = memcg_update_all_caches(num+1);
- if (ret) {
- ida_simple_remove(&kmem_limited_groups, num);
- memcg_kmem_clear_activated(memcg);
- return ret;
- }
-
- memcg->kmemcg_id = num;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
- mutex_init(&memcg->slab_caches_mutex);
- return 0;
-}
-
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
@@ -3019,22 +3145,21 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
- VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(s));
if (num_groups > memcg_limited_groups_array_size) {
int i;
+ struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params) {
- s->memcg_params = cur_params;
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
return -ENOMEM;
- }
- s->memcg_params->is_root_cache = true;
+ new_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
@@ -3048,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
- s->memcg_params->memcg_caches[i] =
+ new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
@@ -3061,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
* bigger than the others. And all updates will reset this
* anyway.
*/
- kfree(cur_params);
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
}
return 0;
}
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
{
size_t size;
@@ -3095,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
return 0;
}
-void memcg_release_cache(struct kmem_cache *s)
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+ kfree(s->memcg_params);
+}
+
+void memcg_register_cache(struct kmem_cache *s)
{
struct kmem_cache *root;
struct mem_cgroup *memcg;
int id;
- /*
- * This happens, for instance, when a root cache goes away before we
- * add any memcg.
- */
- if (!s->memcg_params)
+ if (is_root_cache(s))
return;
- if (s->memcg_params->is_root_cache)
- goto out;
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
+ root = s->memcg_params->root_cache;
memcg = s->memcg_params->memcg;
- id = memcg_cache_id(memcg);
+ id = memcg_cache_id(memcg);
+
+ css_get(&memcg->css);
+
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
+
+ /*
+ * Initialize the pointer to this cache in its parent's memcg_params
+ * before adding it to the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(root->memcg_params->memcg_caches[id]);
+ root->memcg_params->memcg_caches[id] = s;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
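
The smp_wmb() above pairs with a dependency barrier on the reader side; a
hedged sketch (simplified; the real cache_from_memcg_idx() lives in mm/slab.h)
of what such a lockless reader might look like:

	static struct kmem_cache *
	example_cache_from_memcg_idx(struct kmem_cache *s, int idx)
	{
		/* no lock: readers see either NULL or a fully set up cache */
		struct kmem_cache *cachep = s->memcg_params->memcg_caches[idx];

		/* pairs with smp_wmb() in memcg_register_cache() */
		smp_read_barrier_depends();
		return cachep;
	}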
+
+void memcg_unregister_cache(struct kmem_cache *s)
+{
+ struct kmem_cache *root;
+ struct mem_cgroup *memcg;
+ int id;
+
+ if (is_root_cache(s))
+ return;
+
+ /*
+ * Holding the slab_mutex assures nobody will touch the memcg_caches
+ * array while we are modifying it.
+ */
+ lockdep_assert_held(&slab_mutex);
root = s->memcg_params->root_cache;
- root->memcg_params->memcg_caches[id] = NULL;
+ memcg = s->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
+ /*
+ * Clear the pointer to this cache in its parent's memcg_params only
+ * after removing it from the memcg_slab_caches list, otherwise we can
+ * fail to convert memcg_params_to_cache() while traversing the list.
+ */
+ VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
+ root->memcg_params->memcg_caches[id] = NULL;
+
css_put(&memcg->css);
-out:
- kfree(s->memcg_params);
}
/*
@@ -3182,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
* So if we aren't down to zero, we'll just schedule a worker and try
* again
*/
- if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- return;
- } else
+ else
kmem_cache_destroy(cachep);
}
@@ -3222,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
}
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
- struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
{
- struct kmem_cache *new;
+ struct kmem_cache *new = NULL;
static char *tmp_name = NULL;
+ static DEFINE_MUTEX(mutex); /* protects tmp_name */
- lockdep_assert_held(&memcg_cache_mutex);
+ BUG_ON(!memcg_can_account_kmem(memcg));
+ mutex_lock(&mutex);
/*
* kmem_cache_create_memcg duplicates the given name and
* cgroup_name for this name requires RCU context.
@@ -3252,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
if (!tmp_name) {
tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!tmp_name)
- return NULL;
+ goto out;
}
rcu_read_lock();
@@ -3262,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
(s->flags & ~SLAB_PANIC), s->ctor, s);
-
if (new)
new->allocflags |= __GFP_KMEMCG;
-
- return new;
-}
-
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct kmem_cache *new_cachep;
- int idx;
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- idx = memcg_cache_id(memcg);
-
- mutex_lock(&memcg_cache_mutex);
- new_cachep = cachep->memcg_params->memcg_caches[idx];
- if (new_cachep) {
- css_put(&memcg->css);
- goto out;
- }
-
- new_cachep = kmem_cache_dup(memcg, cachep);
- if (new_cachep == NULL) {
- new_cachep = cachep;
- css_put(&memcg->css);
- goto out;
- }
-
- atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
- cachep->memcg_params->memcg_caches[idx] = new_cachep;
- /*
- * the readers won't lock, make sure everybody sees the updated value,
- * so they won't put stuff in the queue again for no reason
- */
- wmb();
+ else
+ new = s;
out:
- mutex_unlock(&memcg_cache_mutex);
- return new_cachep;
+ mutex_unlock(&mutex);
+ return new;
}
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
@@ -3323,11 +3452,12 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
*
* Still, we don't want anyone else freeing memcg_caches under our
* noses, which can happen if a new memcg comes to life. As usual,
- * we'll take the set_limit_mutex to protect ourselves against this.
+ * we'll take the activate_kmem_mutex to protect ourselves against
+ * this.
*/
- mutex_lock(&set_limit_mutex);
- for (i = 0; i < memcg_limited_groups_array_size; i++) {
- c = s->memcg_params->memcg_caches[i];
+ mutex_lock(&activate_kmem_mutex);
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
if (!c)
continue;
@@ -3348,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
}
- mutex_unlock(&set_limit_mutex);
+ mutex_unlock(&activate_kmem_mutex);
}
struct create_work {
@@ -3380,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
cw = container_of(w, struct create_work, work);
memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ css_put(&cw->memcg->css);
kfree(cw);
}
@@ -3439,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
- int idx;
+ struct kmem_cache *memcg_cachep;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3453,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
- idx = memcg_cache_id(memcg);
-
- /*
- * barrier to mare sure we're always seeing the up to date value. The
- * code updating memcg_caches will issue a write barrier to match this.
- */
- read_barrier_depends();
- if (likely(cachep->memcg_params->memcg_caches[idx])) {
- cachep = cachep->memcg_params->memcg_caches[idx];
+ memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ if (likely(memcg_cachep)) {
+ cachep = memcg_cachep;
goto out;
}
@@ -3615,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
if (!memcg)
return;
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
@@ -3663,8 +3788,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
{
/* Update stat data for mem_cgroup */
preempt_disable();
- WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
- __this_cpu_add(from->stat->count[idx], -nr_pages);
+ __this_cpu_sub(from->stat->count[idx], nr_pages);
__this_cpu_add(to->stat->count[idx], nr_pages);
preempt_enable();
}
@@ -3695,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page,
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
@@ -3788,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page,
parent = root_mem_cgroup;
if (nr_pages > 1) {
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
flags = compound_lock_irqsave(page);
}
@@ -3822,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
@@ -3842,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
VM_BUG_ON(!mm);
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_ANON);
@@ -4047,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
/*
* Check if our page_cgroup is valid
@@ -4139,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page)
/* early check. */
if (page_mapped(page))
return;
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
/*
* If the page is in swap cache, uncharge should be deferred
* to the swap path, which also properly accounts swap usage
@@ -4159,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page)
void mem_cgroup_uncharge_cache_page(struct page *page)
{
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping, page);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
}
@@ -4232,7 +4356,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
* css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, css_id(&memcg->css));
+ swap_cgroup_record(ent, mem_cgroup_id(memcg));
}
#endif
@@ -4284,8 +4408,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
{
unsigned short old_id, new_id;
- old_id = css_id(&from->css);
- new_id = css_id(&to->css);
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
@@ -4647,6 +4771,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret;
}
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ unsigned long nr_reclaimed = 0;
+ struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+ unsigned long reclaimed;
+ int loop = 0;
+ struct mem_cgroup_tree_per_zone *mctz;
+ unsigned long long excess;
+ unsigned long nr_scanned;
+
+ if (order > 0)
+ return 0;
+
+ mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+ /*
+ * This loop can run for a while, especially if mem_cgroups continuously
+ * keep exceeding their soft limit and putting the system under
+ * pressure
+ */
+ do {
+ if (next_mz)
+ mz = next_mz;
+ else
+ mz = mem_cgroup_largest_soft_limit_node(mctz);
+ if (!mz)
+ break;
+
+ nr_scanned = 0;
+ reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+ gfp_mask, &nr_scanned);
+ nr_reclaimed += reclaimed;
+ *total_scanned += nr_scanned;
+ spin_lock(&mctz->lock);
+
+ /*
+ * If we failed to reclaim anything from this memory cgroup
+ * it is time to move on to the next cgroup
+ */
+ next_mz = NULL;
+ if (!reclaimed) {
+ do {
+ /*
+ * Loop until we find yet another one.
+ *
+ * By the time we get the soft_limit lock
+ * again, someone might have added the
+ * group back on the RB tree. Iterate to
+ * make sure we get a different mem.
+ * mem_cgroup_largest_soft_limit_node returns
+ * NULL if no other cgroup is present on
+ * the tree
+ */
+ next_mz =
+ __mem_cgroup_largest_soft_limit_node(mctz);
+ if (next_mz == mz)
+ css_put(&next_mz->memcg->css);
+ else /* next_mz == NULL or other memcg */
+ break;
+ } while (1);
+ }
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->memcg->res);
+ /*
+ * One school of thought says that we should not add
+ * back the node to the tree if reclaim returns 0.
+ * But our reclaim could return 0 simply because, due
+ * to priority, we are exposing a smaller subset of
+ * memory to reclaim from. Consider this as a longer
+ * term TODO.
+ */
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ css_put(&mz->memcg->css);
+ loop++;
+ /*
+ * Could not reclaim anything and there are no more
+ * mem cgroups to try or we seem to be looping without
+ * reclaiming anything.
+ */
+ if (!nr_reclaimed &&
+ (next_mz == NULL ||
+ loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+ break;
+ } while (!nr_reclaimed);
+ if (next_mz)
+ css_put(&next_mz->memcg->css);
+ return nr_reclaimed;
+}
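
For context, a sketch of how global reclaim consults this entry point
(simplified from the vmscan call site of this era; the sc-> names are
assumptions):

	nr_soft_scanned = 0;
	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, order,
							  sc->gfp_mask,
							  &nr_soft_scanned);
	sc->nr_reclaimed += nr_soft_reclaimed;
	sc->nr_scanned += nr_soft_scanned;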
+
/**
* mem_cgroup_force_empty_list - clears LRU of a group
* @memcg: group to clear
@@ -4748,31 +4964,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
} while (usage > 0);
}
-/*
- * This mainly exists for tests during the setting of set of use_hierarchy.
- * Since this is the very setting we are changing, the current hierarchy value
- * is meaningless
- */
-static inline bool __memcg_has_children(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *pos;
-
- /* bounce at first found */
- css_for_each_child(pos, &memcg->css)
- return true;
- return false;
-}
-
-/*
- * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
- * to be already dead (as in mem_cgroup_force_empty, for instance). This is
- * from mem_cgroup_count_children(), in the sense that we don't really care how
- * many children we have; we only need to know if we have any. It also counts
- * any memcg without hierarchy as infertile.
- */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
- return memcg->use_hierarchy && __memcg_has_children(memcg);
+ lockdep_assert_held(&memcg_create_mutex);
+ /*
+ * The lock does not prevent addition to or deletion from the list
+ * of children, but it prevents a new child from being
+ * initialized based on this parent in css_online(), so it's
+ * enough to decide whether hierarchically inherited
+ * attributes can still be changed or not.
+ */
+ return memcg->use_hierarchy &&
+ !list_empty(&memcg->css.cgroup->children);
}
/*
@@ -4852,7 +5055,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
- if (!__memcg_has_children(memcg))
+ if (list_empty(&memcg->css.cgroup->children))
memcg->use_hierarchy = val;
else
retval = -EBUSY;
@@ -4905,14 +5108,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
-static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- char str[64];
u64 val;
- int name, len;
+ int name;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
@@ -4938,15 +5139,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
BUG();
}
- len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
- return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+ return val;
}
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
- int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int err = 0;
+ int memcg_id;
+
+ if (memcg_kmem_is_active(memcg))
+ return 0;
+
+ /*
+ * We are going to allocate memory for data shared by all memory
+ * cgroups, so let's stop accounting here.
+ */
+ memcg_stop_kmem_account();
+
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
@@ -4960,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- mutex_lock(&set_limit_mutex);
- if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
- if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
- ret = -EBUSY;
- goto out;
- }
- ret = res_counter_set_limit(&memcg->kmem, val);
- VM_BUG_ON(ret);
+ if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+ err = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (err)
+ goto out;
- ret = memcg_update_cache_sizes(memcg);
- if (ret) {
- res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
- goto out;
- }
- static_key_slow_inc(&memcg_kmem_enabled_key);
- /*
- * setting the active bit after the inc will guarantee no one
- * starts accounting before all call sites are patched
- */
- memcg_kmem_set_active(memcg);
- } else
- ret = res_counter_set_limit(&memcg->kmem, val);
+ memcg_id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (memcg_id < 0) {
+ err = memcg_id;
+ goto out;
+ }
+
+ /*
+ * Make sure we have enough space for this cgroup in each root cache's
+ * memcg_params.
+ */
+ err = memcg_update_all_caches(memcg_id + 1);
+ if (err)
+ goto out_rmid;
+
+ memcg->kmemcg_id = memcg_id;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+
+ /*
+ * We couldn't have accounted to this cgroup, because it hasn't got the
+ * active bit set yet, so this should succeed.
+ */
+ err = res_counter_set_limit(&memcg->kmem, limit);
+ VM_BUG_ON(err);
+
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * Setting the active bit after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg_kmem_set_active(memcg);
out:
- mutex_unlock(&set_limit_mutex);
- mutex_unlock(&memcg_create_mutex);
-#endif
+ memcg_resume_kmem_account();
+ return err;
+
+out_rmid:
+ ida_simple_remove(&kmem_limited_groups, memcg_id);
+ goto out;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int ret;
+
+ mutex_lock(&activate_kmem_mutex);
+ ret = __memcg_activate_kmem(memcg, limit);
+ mutex_unlock(&activate_kmem_mutex);
+ return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int ret;
+
+ if (!memcg_kmem_is_active(memcg))
+ ret = memcg_activate_kmem(memcg, val);
+ else
+ ret = res_counter_set_limit(&memcg->kmem, val);
return ret;
}
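With activation split out this way, only the first write to the kmem limit takes the slow path: an initial `echo 512M > memory.kmem.limit_in_bytes` (illustrative) goes through memcg_activate_kmem() to allocate an id, resize the root caches and flip the static key, while every later write merely calls res_counter_set_limit().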
-#ifdef CONFIG_MEMCG_KMEM
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- if (!parent)
- goto out;
- memcg->kmem_account_flags = parent->kmem_account_flags;
- /*
- * When that happens, we need to disable the static branch only on those
- * memcgs that enabled it. To achieve this, we would be forced to
- * complicate the code by keeping track of which memcgs were the ones
- * that actually enabled limits, and which ones got it from their
- * parents.
- *
- * It is a lot simpler just to do static_key_slow_inc() on every child
- * that is accounted.
- */
- if (!memcg_kmem_is_active(memcg))
- goto out;
+ if (!parent)
+ return 0;
+ mutex_lock(&activate_kmem_mutex);
/*
- * __mem_cgroup_free() will issue static_key_slow_dec() because this
- * memcg is active already. If the later initialization fails then the
- * cgroup core triggers the cleanup so we do not have to do it here.
+ * If the parent cgroup is not kmem-active now, it cannot be activated
+ * after this point, because it has at least one child already.
*/
- static_key_slow_inc(&memcg_kmem_enabled_key);
-
- mutex_lock(&set_limit_mutex);
- memcg_stop_kmem_account();
- ret = memcg_update_cache_sizes(memcg);
- memcg_resume_kmem_account();
- mutex_unlock(&set_limit_mutex);
-out:
+ if (memcg_kmem_is_active(parent))
+ ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+ mutex_unlock(&activate_kmem_mutex);
return ret;
}
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_MEMCG_KMEM */
/*
@@ -5059,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
else if (type == _KMEM)
- ret = memcg_update_kmem_limit(css, val);
+ ret = memcg_update_kmem_limit(memcg, val);
else
return -EINVAL;
break;
@@ -5176,48 +5417,52 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
-static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
-{
+static int memcg_numa_stat_show(struct seq_file *m, void *v)
+{
+ struct numa_stat {
+ const char *name;
+ unsigned int lru_mask;
+ };
+
+ static const struct numa_stat stats[] = {
+ { "total", LRU_ALL },
+ { "file", LRU_ALL_FILE },
+ { "anon", LRU_ALL_ANON },
+ { "unevictable", BIT(LRU_UNEVICTABLE) },
+ };
+ const struct numa_stat *stat;
int nid;
- unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
- unsigned long node_nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
- total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
- seq_printf(m, "total=%lu", total_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
-
- file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
- seq_printf(m, "file=%lu", file_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_FILE);
- seq_printf(m, " N%d=%lu", nid, node_nr);
+ unsigned long nr;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
+ seq_printf(m, "%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
}
- seq_putc(m, '\n');
- anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
- seq_printf(m, "anon=%lu", anon_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- LRU_ALL_ANON);
- seq_printf(m, " N%d=%lu", nid, node_nr);
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ struct mem_cgroup *iter;
+
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
+ seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
+ for_each_node_state(nid, N_MEMORY) {
+ nr = 0;
+ for_each_mem_cgroup_tree(iter, memcg)
+ nr += mem_cgroup_node_nr_lru_pages(
+ iter, nid, stat->lru_mask);
+ seq_printf(m, " N%d=%lu", nid, nr);
+ }
+ seq_putc(m, '\n');
}
- seq_putc(m, '\n');
- unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
- seq_printf(m, "unevictable=%lu", unevictable_nr);
- for_each_node_state(nid, N_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- BIT(LRU_UNEVICTABLE));
- seq_printf(m, " N%d=%lu", nid, node_nr);
- }
- seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */
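For reference, the table-driven handler above emits memory.numa_stat in the following shape, one N<nid> column per node with memory (values illustrative):

	total=2048 N0=1024 N1=1024
	file=1536 N0=768 N1=768
	anon=512 N0=256 N1=256
	unevictable=0 N0=0 N1=0
	hierarchical_total=4096 N0=2048 N1=2048
	hierarchical_file=3072 N0=1536 N1=1536
	hierarchical_anon=1024 N0=512 N1=512
	hierarchical_unevictable=0 N0=0 N1=0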
@@ -5227,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
-static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
- struct seq_file *m)
+static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct mem_cgroup *mi;
unsigned int i;
@@ -5439,13 +5683,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
-static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
@@ -5522,13 +5764,23 @@ unlock:
return ret;
}
-static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
@@ -5601,14 +5853,23 @@ unlock:
mutex_unlock(&memcg->thresholds_lock);
}
-static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *event;
- enum res_type type = MEMFILE_TYPE(cft->private);
- BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
@@ -5626,14 +5887,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
return 0;
}
-static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *ev, *tmp;
- enum res_type type = MEMFILE_TYPE(cft->private);
-
- BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
@@ -5647,17 +5904,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
spin_unlock(&memcg_oom_lock);
}
-static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct cgroup_map_cb *cb)
+static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
- cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
-
- if (atomic_read(&memcg->under_oom))
- cb->fill(cb, "under_oom", 1);
- else
- cb->fill(cb, "under_oom", 0);
+ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
return 0;
}
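The seq_file conversion keeps memory.oom_control in the same two-line key/value format, e.g.:

	oom_kill_disable 0
	under_oom 0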
@@ -5750,41 +6002,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
}
#endif
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such a level of flexibility is completely
+ * unnecessary, especially in light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ unsigned long flags = (unsigned long)key;
+
+ if (flags & POLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return, knowing the other side will clean up
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but memcg_event_remove()
+ * may sleep, so we have to call it from a workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mem_cgroup_event *event =
+ container_of(pt, struct mem_cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int memcg_write_event_control(struct cgroup_subsys_state *css,
+ struct cftype *cft, const char *buffer)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ const char *name;
+ char *endp;
+ int ret;
+
+ efd = simple_strtoul(buffer, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buffer = endp + 1;
+
+ cfd = simple_strtoul(buffer, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buffer = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process needs read permission on the control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cfile.file->f_dentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify that @cfile belongs to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ rcu_read_lock();
+
+ ret = -EINVAL;
+ cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
+ &mem_cgroup_subsys);
+ if (cfile_css == css && css_tryget(css))
+ ret = 0;
+
+ rcu_read_unlock();
+ if (ret)
+ goto out_put_cfile;
+
+ ret = event->register_event(memcg, event->eventfd, buffer);
+ if (ret)
+ goto out_put_css;
+
+ efile.file->f_op->poll(efile.file, &event->pt);
+
+ spin_lock(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return 0;
+
+out_put_css:
+ css_put(css);
+out_put_cfile:
+ fdput(cfile);
+out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+out_put_efile:
+ fdput(efile);
+out_kfree:
+ kfree(event);
+
+ return ret;
+}
+
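For reference, a minimal userspace sketch of the '<event_fd> <control_fd> <args>' protocol parsed above (the cgroup name "demo" and the /sys/fs/cgroup/memory mount point are assumptions; for the usage_in_bytes files the args field is a threshold in bytes):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <unistd.h>

	int main(void)
	{
		char line[64];
		uint64_t ticks;
		int efd = eventfd(0, 0);
		int cfd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
			       O_RDONLY);
		int ecfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
				O_WRONLY);

		if (efd < 0 || cfd < 0 || ecfd < 0)
			return 1;

		/* "<event_fd> <control_fd> <args>": arm a 1M usage threshold. */
		snprintf(line, sizeof(line), "%d %d %llu", efd, cfd, 1ULL << 20);
		if (write(ecfd, line, strlen(line)) < 0)
			return 1;

		/* Blocks until the threshold is crossed, then reports the count. */
		read(efd, &ticks, sizeof(ticks));
		printf("threshold crossed %llu time(s)\n",
		       (unsigned long long)ticks);
		return 0;
	}
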
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
- .read_seq_string = memcg_stat_show,
+ .seq_show = memcg_stat_show,
},
{
.name = "force_empty",
@@ -5797,6 +6269,12 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_hierarchy_read,
},
{
+ .name = "cgroup.event_control", /* XXX: for compat */
+ .write_string = memcg_write_event_control,
+ .flags = CFTYPE_NO_PREFIX,
+ .mode = S_IWUGO,
+ },
+ {
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
@@ -5808,21 +6286,17 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "oom_control",
- .read_map = mem_cgroup_oom_control_read,
+ .seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .register_event = mem_cgroup_oom_register_event,
- .unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
- .register_event = vmpressure_register_event,
- .unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .read_seq_string = memcg_numa_stat_show,
+ .seq_show = memcg_numa_stat_show,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
@@ -5830,29 +6304,29 @@ static struct cftype mem_cgroup_files[] = {
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .read_seq_string = mem_cgroup_slabinfo_read,
+ .seq_show = mem_cgroup_slabinfo_read,
},
#endif
#endif
@@ -5864,27 +6338,25 @@ static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{ }, /* terminate */
};
@@ -5911,6 +6383,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
lruvec_init(&mz->lruvec);
+ mz->usage_in_excess = 0;
+ mz->on_tree = false;
mz->memcg = memcg;
}
memcg->nodeinfo[node] = pn;
@@ -5925,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size = memcg_size();
+ size_t size;
- /* Can be very big if nr_node_ids is very big */
- if (size < PAGE_SIZE)
- memcg = kzalloc(size, GFP_KERNEL);
- else
- memcg = vzalloc(size);
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
@@ -5943,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return memcg;
out_free:
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
return NULL;
}
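The single kzalloc() above relies on the per-node pointers being a flexible tail of the struct; a sketch of the assumed layout (fields abridged):

	struct mem_cgroup {
		struct cgroup_subsys_state css;
		/* ... fixed-size fields ... */
		struct mem_cgroup_per_node *nodeinfo[0];  /* nr_node_ids entries */
	};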
@@ -5964,9 +6433,8 @@ out_free:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- size_t size = memcg_size();
- free_css_id(&mem_cgroup_subsys, &memcg->css);
+ mem_cgroup_remove_from_trees(memcg);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
@@ -5985,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
* the cgroup_lock.
*/
disarm_static_keys(memcg);
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
}
/*
@@ -6002,6 +6467,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+ struct mem_cgroup_tree_per_node *rtpn;
+ struct mem_cgroup_tree_per_zone *rtpz;
+ int tmp, node, zone;
+
+ for_each_node(node) {
+ tmp = node;
+ if (!node_state(node, N_NORMAL_MEMORY))
+ tmp = -1;
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+ BUG_ON(!rtpn);
+
+ soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ rtpz = &rtpn->rb_tree_per_zone[zone];
+ rtpz->rb_root = RB_ROOT;
+ spin_lock_init(&rtpz->lock);
+ }
+ }
+}
+
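Given how the tree is populated above, the soft_limit_tree_node_zone() helper used at the top of mem_cgroup_soft_limit_reclaim() plausibly reduces to the following (a sketch; its body is outside this hunk):

	static struct mem_cgroup_tree_per_zone *
	soft_limit_tree_node_zone(int nid, int zid)
	{
		return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
	}
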
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -6031,7 +6519,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
- spin_lock_init(&memcg->soft_lock);
+ INIT_LIST_HEAD(&memcg->event_list);
+ spin_lock_init(&memcg->event_list_lock);
return &memcg->css;
@@ -6045,7 +6534,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
- int error = 0;
+
+ if (css->cgroup->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
if (!parent)
return 0;
@@ -6077,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
mem_cgroup_subsys.broken_hierarchy = true;
}
-
- error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- return error;
+
+ return memcg_init_kmem(memcg, &mem_cgroup_subsys);
}
/*
@@ -6104,18 +6594,32 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event, *tmp;
+ struct cgroup_subsys_state *iter;
+
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removal only after rmdir of the cgroup
+ * directory, to avoid a race between userspace and kernelspace.
+ */
+ spin_lock(&memcg->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+ list_del_init(&event->list);
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
kmem_cgroup_css_offline(memcg);
mem_cgroup_invalidate_reclaim_iterators(memcg);
- mem_cgroup_reparent_charges(memcg);
- if (memcg->soft_contributed) {
- while ((memcg = parent_mem_cgroup(memcg)))
- atomic_dec(&memcg->children_in_excess);
- if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
- atomic_dec(&root_mem_cgroup->children_in_excess);
- }
+ /*
+ * This requires that offlining is serialized. Right now that is
+ * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+ */
+ css_for_each_descendant_post(iter, css)
+ mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
+
mem_cgroup_destroy_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
@@ -6123,6 +6627,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * XXX: css_offline() would be where we should reparent all
+ * memory to prepare the cgroup for destruction. However,
+ * memcg does not do css_tryget() and res_counter charging
+ * under the same RCU lock region, which means that charging
+ * could race with offlining. Offlining only happens to
+ * cgroups with no tasks in them but charges can show up
+ * without any tasks from the swapin path when the target
+ * memcg is looked up from the swapout record and not from the
+ * current task as it usually is. A race like this can leak
+ * charges and put pages with stale cgroup pointers into
+ * circulation:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget()
+ * rcu_read_unlock()
+ * disable css_tryget()
+ * call_rcu()
+ * offline_css()
+ * reparent_charges()
+ * res_counter_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * The bulk of the charges are still moved in offline_css() to
+ * avoid pinning a lot of pages in case a long-term reference
+ * like a swapout record is deferring the css_free() until long
+ * after offlining. But this makes sure we catch any charges
+ * made after offlining:
+ */
+ mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);
@@ -6325,7 +6865,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
- css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
+ mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
@@ -6347,7 +6887,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
- VM_BUG_ON(!page || !PageHead(page));
+ VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!move_anon())
return ret;
pc = lookup_page_cgroup(page);
@@ -6376,10 +6916,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6568,9 +7108,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6587,7 +7127,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -6745,7 +7285,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
.bind = mem_cgroup_bind,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
- .use_id = 1,
};
#ifdef CONFIG_MEMCG_SWAP
@@ -6790,6 +7329,7 @@ static int __init mem_cgroup_init(void)
{
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
enable_swap_cgroup();
+ mem_cgroup_soft_limit_tree_init();
memcg_stock_init();
return 0;
}