diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 166 |
1 files changed, 163 insertions, 3 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c..ebca767292d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. Maybe even add a low water mark, such that
+ * no reclaim occurs from a cgroup at its low water mark; this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is at least two
+ * byte aligned (based on comments from Nick Piggin)
+ */
+#define PAGE_CGROUP_LOCK_BIT	0x0
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -56,6 +69,8 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
 	css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+					&page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-	page->page_cgroup = (unsigned long)pc;
+	int locked;
+
+	/*
+	 * While resetting the page_cgroup we might not hold the
+	 * page_cgroup lock. free_hot_cold_page() is an example
+	 * of such a scenario
+	 */
+	if (pc)
+		VM_BUG_ON(!page_cgroup_locked(page));
+	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+	page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-	return page->page_cgroup;
+	return (struct page_cgroup *)
+		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage. The charge goes to the
+ * cgroup of @mm, or of init_mm when @mm is NULL.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit or the allocation failed
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	struct mem_cgroup *mem;
+	struct page_cgroup *pc, *race_pc;
+
+	/*
+	 * Should page_cgroup's go to their own slab?
+	 * One could optimize the performance of the charging routine
+	 * by saving a bit in the page_flags and using it as a lock
+	 * to see if the cgroup page already has a page_cgroup associated
+	 * with it
+	 */
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	/*
+	 * The page_cgroup exists and the page has already been accounted
+	 */
+	if (pc) {
+		atomic_inc(&pc->ref_cnt);
+		goto done;
+	}
+
+	unlock_page_cgroup(page);
+
+	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+	if (pc == NULL)
+		goto err;
+
+	rcu_read_lock();
+	/*
+	 * We always charge the cgroup the mm_struct belongs to
+	 * the mm_struct's mem_cgroup changes on task migration if the
+	 * thread group leader migrates. It's possible that mm is not
+	 * set, if so charge the init_mm (happens for pagecache usage).
+	 */
+	if (!mm)
+		mm = &init_mm;
+
+	mem = rcu_dereference(mm->mem_cgroup);
+	/*
+	 * For every charge from the cgroup, increment reference
+	 * count
+	 */
+	css_get(&mem->css);
+	rcu_read_unlock();
+
+	/*
+	 * If we created the page_cgroup, we should free it on exceeding
+	 * the cgroup limit.
+	 */
+	if (res_counter_charge(&mem->res, 1)) {
+		css_put(&mem->css);
+		goto free_pc;
+	}
+
+	lock_page_cgroup(page);
+	/*
+	 * Check if somebody else beat us to allocating the page_cgroup
+	 */
+	race_pc = page_get_page_cgroup(page);
+	if (race_pc) {
+		kfree(pc);
+		pc = race_pc;
+		atomic_inc(&pc->ref_cnt);
+		res_counter_uncharge(&mem->res, 1);
+		css_put(&mem->css);
+		goto done;
+	}
+
+	atomic_set(&pc->ref_cnt, 1);
+	pc->mem_cgroup = mem;
+	pc->page = page;
+	page_assign_page_cgroup(page, pc);
+
+done:
+	unlock_page_cgroup(page);
+	return 0;
+	/* Failure paths are entered without the page_cgroup lock held */
+free_pc:
+	kfree(pc);
+err:
+	return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge. Dropping the last reference detaches and frees @pc.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem;
+	struct page *page;
+
+	if (!pc)
+		return;
+
+	if (atomic_dec_and_test(&pc->ref_cnt)) {
+		page = pc->page;
+		lock_page_cgroup(page);
+		mem = pc->mem_cgroup;
+		css_put(&mem->css);
+		page_assign_page_cgroup(page, NULL);
+		unlock_page_cgroup(page);
+		res_counter_uncharge(&mem->res, 1);
+		kfree(pc);
+	}
 }
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return NULL;
 
 	res_counter_init(&mem->res);
+	INIT_LIST_HEAD(&mem->active_list);
+	INIT_LIST_HEAD(&mem->inactive_list);
 	return &mem->css;
 }
 