From bd845e38c7a7251a95a8f2c38aa7fb87140b771d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:01 -0800 Subject: memcg: mm_match_cgroup not vm_match_cgroup vm_match_cgroup is a perverse name for a macro to match mm with cgroup: rename it mm_match_cgroup, matching mm_init_cgroup and mm_free_cgroup. Signed-off-by: Hugh Dickins Acked-by: David Rientjes Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 631002d085d..41041c0a689 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -399,7 +399,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) int ret; task_lock(task); - ret = task->mm && vm_match_cgroup(task->mm, mem); + ret = task->mm && mm_match_cgroup(task->mm, mem); task_unlock(task); return ret; } -- cgit v1.2.3-70-g09d2 From 427d5416f317681498337ab19218d195edea02d6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:03 -0800 Subject: memcg: move_lists on page not page_cgroup Each caller of mem_cgroup_move_lists is having to use page_get_page_cgroup: it's more convenient if it acts upon the page itself not the page_cgroup; and in a later patch this becomes important to handle within memcontrol.c. Signed-off-by: Hugh Dickins Cc: David Rientjes Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- mm/memcontrol.c | 4 +++- mm/swap.c | 2 +- mm/vmscan.c | 5 +++-- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e4247c83c1c..56432ff8d4e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -36,7 +36,7 @@ extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); extern void mem_cgroup_uncharge(struct page_cgroup *pc); extern void mem_cgroup_uncharge_page(struct page *page); -extern void mem_cgroup_move_lists(struct page_cgroup *pc, bool active); +extern void mem_cgroup_move_lists(struct page *page, bool active); extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -106,8 +106,7 @@ static inline void mem_cgroup_uncharge_page(struct page *page) { } -static inline void mem_cgroup_move_lists(struct page_cgroup *pc, - bool active) +static inline void mem_cgroup_move_lists(struct page *page, bool active) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 41041c0a689..afdd406f618 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -407,11 +407,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) /* * This routine assumes that the appropriate zone's lru lock is already held */ -void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +void mem_cgroup_move_lists(struct page *page, bool active) { + struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; unsigned long flags; + pc = page_get_page_cgroup(page); if (!pc) return; diff --git a/mm/swap.c b/mm/swap.c index 710a20bb974..d4ec59aa5c4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -176,7 +176,7 @@ void activate_page(struct page *page) SetPageActive(page); 
add_page_to_active_list(zone, page); __count_vm_event(PGACTIVATE); - mem_cgroup_move_lists(page_get_page_cgroup(page), true); + mem_cgroup_move_lists(page, true); } spin_unlock_irq(&zone->lru_lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 106ba10e1ac..45711585684 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1128,7 +1128,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, ClearPageActive(page); list_move(&page->lru, &zone->inactive_list); - mem_cgroup_move_lists(page_get_page_cgroup(page), false); + mem_cgroup_move_lists(page, false); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); @@ -1156,8 +1156,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, VM_BUG_ON(PageLRU(page)); SetPageLRU(page); VM_BUG_ON(!PageActive(page)); + list_move(&page->lru, &zone->active_list); - mem_cgroup_move_lists(page_get_page_cgroup(page), true); + mem_cgroup_move_lists(page, true); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); -- cgit v1.2.3-70-g09d2 From 9442ec9df40d952b0de185ae5638a74970388e01 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:07 -0800 Subject: memcg: bad page if page_cgroup when free Replace free_hot_cold_page's VM_BUG_ON(page_get_page_cgroup(page)) by a "Bad page state" and clear: most users don't have CONFIG_DEBUG_VM on, and if it were set here, it'd likely cause corruption when the page is reused. Don't use page_assign_page_cgroup to clear it: that should be private to memcontrol.c, and always called with the lock taken; and memmap_init_zone doesn't need it either - like page->mapping and other pointers throughout the kernel, Linux assumes pointers in zeroed structures are NULL pointers. Instead use page_reset_bad_cgroup, added to memcontrol.h for this only. Signed-off-by: Hugh Dickins Cc: David Rientjes Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++---- mm/memcontrol.c | 27 ++++++++++++--------------- mm/page_alloc.c | 18 ++++++++++++------ 3 files changed, 28 insertions(+), 25 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 56432ff8d4e..70789df7dab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,8 +29,9 @@ struct mm_struct; extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p); extern void mm_free_cgroup(struct mm_struct *mm); -extern void page_assign_page_cgroup(struct page *page, - struct page_cgroup *pc); + +#define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0) + extern struct page_cgroup *page_get_page_cgroup(struct page *page); extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); @@ -82,8 +83,7 @@ static inline void mm_free_cgroup(struct mm_struct *mm) { } -static inline void page_assign_page_cgroup(struct page *page, - struct page_cgroup *pc) +static inline void page_reset_bad_cgroup(struct page *page) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index afdd406f618..9e170d3c71e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -140,11 +140,17 @@ struct mem_cgroup { /* * We use the lower bit of the page->page_cgroup pointer as a bit spin - * lock. We need to ensure that page->page_cgroup is atleast two - * byte aligned (based on comments from Nick Piggin) + * lock. 
We need to ensure that page->page_cgroup is at least two + * byte aligned (based on comments from Nick Piggin). But since + * bit_spin_lock doesn't actually set that lock bit in a non-debug + * uniprocessor kernel, we should avoid setting it here too. */ #define PAGE_CGROUP_LOCK_BIT 0x0 -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) +#else +#define PAGE_CGROUP_LOCK 0x0 +#endif /* * A page_cgroup page is associated with every page descriptor. The @@ -271,19 +277,10 @@ static inline int page_cgroup_locked(struct page *page) &page->page_cgroup); } -void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) +static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) { - int locked; - - /* - * While resetting the page_cgroup we might not hold the - * page_cgroup lock. free_hot_cold_page() is an example - * of such a scenario - */ - if (pc) - VM_BUG_ON(!page_cgroup_locked(page)); - locked = (page->page_cgroup & PAGE_CGROUP_LOCK); - page->page_cgroup = ((unsigned long)pc | locked); + VM_BUG_ON(!page_cgroup_locked(page)); + page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); } struct page_cgroup *page_get_page_cgroup(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e76cf94725c..402a504f122 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -222,13 +222,19 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - printk(KERN_EMERG "Bad page state in process '%s'\n" - KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" - KERN_EMERG "Trying to fix it up, but a reboot is needed\n" - KERN_EMERG "Backtrace:\n", + void *pc = page_get_page_cgroup(page); + + printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG + "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", current->comm, page, (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); + if (pc) { + printk(KERN_EMERG "cgroup:%p\n", pc); + page_reset_bad_cgroup(page); + } + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n"); dump_stack(); page->flags &= ~(1 << PG_lru | 1 << PG_private | @@ -454,6 +460,7 @@ static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | + (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | @@ -603,6 +610,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | + (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | @@ -989,7 +997,6 @@ static void free_hot_cold_page(struct page *page, int cold) if (!PageHighMem(page)) debug_check_no_locks_freed(page_address(page), PAGE_SIZE); - VM_BUG_ON(page_get_page_cgroup(page)); arch_free_page(page, 0); kernel_map_pages(page, 1, 0); @@ -2528,7 +2535,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); - page_assign_page_cgroup(page, NULL); SetPageReserved(page); /* -- cgit v1.2.3-70-g09d2 From 7e924aafa4b03ff71de34af8553d9a1ebc86c071 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:08 -0800 Subject: memcg: mem_cgroup_charge never NULL My memcgroup patch 
to fix hang with shmem/tmpfs added NULL page handling to mem_cgroup_charge_common. It seemed convenient at the time, but hard to justify now: there's a perfectly appropriate swappage to charge and uncharge instead, this is not on any hot path through shmem_getpage, and no performance hit was observed from the slight extra overhead. So revert that NULL page handling from mem_cgroup_charge_common; and make it clearer by bringing page_cgroup_assign_new_page_cgroup into its body - that was a helper I found more of a hindrance to understanding. Signed-off-by: Hugh Dickins Cc: David Rientjes Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 61 ++++++++++++++++++++------------------------------------- mm/shmem.c | 9 ++++++--- 2 files changed, 27 insertions(+), 43 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e170d3c71e..83ba13ad31e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -300,25 +300,6 @@ static void __always_inline unlock_page_cgroup(struct page *page) bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } -/* - * Tie new page_cgroup to struct page under lock_page_cgroup() - * This can fail if the page has been tied to a page_cgroup. - * If success, returns 0. - */ -static int page_cgroup_assign_new_page_cgroup(struct page *page, - struct page_cgroup *pc) -{ - int ret = 0; - - lock_page_cgroup(page); - if (!page_get_page_cgroup(page)) - page_assign_page_cgroup(page, pc); - else /* A page is tied to other pc. */ - ret = 1; - unlock_page_cgroup(page); - return ret; -} - /* * Clear page->page_cgroup member under lock_page_cgroup(). * If given "pc" value is different from one page->page_cgroup, @@ -585,26 +566,24 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, * with it */ retry: - if (page) { - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - /* - * The page_cgroup exists and - * the page has already been accounted. - */ - if (pc) { - if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { - /* this page is under being uncharged ? */ - unlock_page_cgroup(page); - cpu_relax(); - goto retry; - } else { - unlock_page_cgroup(page); - goto done; - } + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); + /* + * The page_cgroup exists and + * the page has already been accounted. + */ + if (pc) { + if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { + /* this page is under being uncharged ? */ + unlock_page_cgroup(page); + cpu_relax(); + goto retry; + } else { + unlock_page_cgroup(page); + goto done; } - unlock_page_cgroup(page); } + unlock_page_cgroup(page); pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); if (pc == NULL) @@ -663,7 +642,9 @@ retry: if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) pc->flags |= PAGE_CGROUP_FLAG_CACHE; - if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) { + lock_page_cgroup(page); + if (page_get_page_cgroup(page)) { + unlock_page_cgroup(page); /* * Another charge has been added to this page already. 
* We take lock_page_cgroup(page) again and read @@ -672,10 +653,10 @@ retry: res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); kfree(pc); - if (!page) - goto done; goto retry; } + page_assign_page_cgroup(page, pc); + unlock_page_cgroup(page); mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); diff --git a/mm/shmem.c b/mm/shmem.c index 90b576cbc06..3372bc579e8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1370,14 +1370,17 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(swappage); - page_cache_release(swappage); if (error == -ENOMEM) { /* allow reclaim from this memory cgroup */ - error = mem_cgroup_cache_charge(NULL, + error = mem_cgroup_cache_charge(swappage, current->mm, gfp & ~__GFP_HIGHMEM); - if (error) + if (error) { + page_cache_release(swappage); goto failed; + } + mem_cgroup_uncharge_page(swappage); } + page_cache_release(swappage); goto repeat; } } else if (sgp == SGP_READ && !filepage) { -- cgit v1.2.3-70-g09d2 From 8289546e573d5ff681cdf0fc7a1184cca66fdb55 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:08 -0800 Subject: memcg: remove mem_cgroup_uncharge Nothing uses mem_cgroup_uncharge apart from mem_cgroup_uncharge_page, (a trivial wrapper around it) and mem_cgroup_end_migration (which does the same as mem_cgroup_uncharge_page). And it often ends up having to lock just to let its caller unlock. Remove it (but leave the silly locking until a later patch). Moved mem_cgroup_cache_charge next to mem_cgroup_charge in memcontrol.h. Signed-off-by: Hugh Dickins Cc: David Rientjes Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 20 +++++++------------- mm/memcontrol.c | 23 ++++++++--------------- 2 files changed, 15 insertions(+), 28 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 70789df7dab..8b1c4295848 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,7 +35,8 @@ extern void mm_free_cgroup(struct mm_struct *mm); extern struct page_cgroup *page_get_page_cgroup(struct page *page); extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern void mem_cgroup_uncharge(struct page_cgroup *pc); +extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_move_lists(struct page *page, bool active); extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, @@ -45,8 +46,6 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct mem_cgroup *mem_cont, int active); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); -extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); #define mm_match_cgroup(mm, cgroup) \ @@ -92,14 +91,16 @@ static inline struct page_cgroup *page_get_page_cgroup(struct page *page) return NULL; } -static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +static inline int mem_cgroup_charge(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { return 0; } -static inline void mem_cgroup_uncharge(struct page_cgroup *pc) +static inline int mem_cgroup_cache_charge(struct page 
*page, + struct mm_struct *mm, gfp_t gfp_mask) { + return 0; } static inline void mem_cgroup_uncharge_page(struct page *page) @@ -110,13 +111,6 @@ static inline void mem_cgroup_move_lists(struct page *page, bool active) { } -static inline int mem_cgroup_cache_charge(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask) -{ - return 0; -} - static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) { return 1; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 83ba13ad31e..1333d25163b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -697,20 +697,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, /* * Uncharging is always a welcome operation, we never complain, simply - * uncharge. This routine should be called with lock_page_cgroup held + * uncharge. */ -void mem_cgroup_uncharge(struct page_cgroup *pc) +void mem_cgroup_uncharge_page(struct page *page) { + struct page_cgroup *pc; struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; - struct page *page; unsigned long flags; /* * Check if our page_cgroup is valid */ + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); if (!pc) - return; + goto unlock; if (atomic_dec_and_test(&pc->ref_cnt)) { page = pc->page; @@ -731,12 +733,8 @@ void mem_cgroup_uncharge(struct page_cgroup *pc) } lock_page_cgroup(page); } -} -void mem_cgroup_uncharge_page(struct page *page) -{ - lock_page_cgroup(page); - mem_cgroup_uncharge(page_get_page_cgroup(page)); +unlock: unlock_page_cgroup(page); } @@ -759,12 +757,7 @@ int mem_cgroup_prepare_migration(struct page *page) void mem_cgroup_end_migration(struct page *page) { - struct page_cgroup *pc; - - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - mem_cgroup_uncharge(pc); - unlock_page_cgroup(page); + mem_cgroup_uncharge_page(page); } /* * We know both *page* and *newpage* are now not-on-LRU and Pg_locked. -- cgit v1.2.3-70-g09d2 From 8869b8f6e09a1b49bf915eb03f663f2e4e8fbcd4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:09 -0800 Subject: memcg: memcontrol whitespace cleanups Sorry, before getting down to more important changes, I'd like to do some cleanup in memcontrol.c. This patch doesn't change the code generated, but cleans up whitespace, moves up a double declaration, removes an unused enum, removes void returns, removes misleading comments, that kind of thing. 
Signed-off-by: Hugh Dickins Cc: David Rientjes Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 94 +++++++++++++++++---------------------------------------- 1 file changed, 28 insertions(+), 66 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1333d25163b..31ab2c014fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -137,6 +137,7 @@ struct mem_cgroup { */ struct mem_cgroup_stat stat; }; +static struct mem_cgroup init_mem_cgroup; /* * We use the lower bit of the page->page_cgroup pointer as a bit spin @@ -162,7 +163,7 @@ struct page_cgroup { struct mem_cgroup *mem_cgroup; atomic_t ref_cnt; /* Helpful when pages move b/w */ /* mapped and cached states */ - int flags; + int flags; }; #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ @@ -177,20 +178,11 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) return page_zonenum(pc->page); } -enum { - MEM_CGROUP_TYPE_UNSPEC = 0, - MEM_CGROUP_TYPE_MAPPED, - MEM_CGROUP_TYPE_CACHED, - MEM_CGROUP_TYPE_ALL, - MEM_CGROUP_TYPE_MAX, -}; - enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, }; - /* * Always modified under lru lock. Then, not necessary to preempt_disable() */ @@ -199,11 +191,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, { int val = (charge)? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; - VM_BUG_ON(!irqs_disabled()); + VM_BUG_ON(!irqs_disabled()); if (flags & PAGE_CGROUP_FLAG_CACHE) - __mem_cgroup_stat_add_safe(stat, - MEM_CGROUP_STAT_CACHE, val); + __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); else __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); } @@ -240,8 +231,6 @@ static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, return total; } -static struct mem_cgroup init_mem_cgroup; - static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { @@ -273,8 +262,7 @@ void mm_free_cgroup(struct mm_struct *mm) static inline int page_cgroup_locked(struct page *page) { - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, - &page->page_cgroup); + return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) @@ -285,8 +273,7 @@ static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) struct page_cgroup *page_get_page_cgroup(struct page *page) { - return (struct page_cgroup *) - (page->page_cgroup & ~PAGE_CGROUP_LOCK); + return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); } static void __always_inline lock_page_cgroup(struct page *page) @@ -308,7 +295,6 @@ static void __always_inline unlock_page_cgroup(struct page *page) * A can can detect failure of clearing by following * clear_page_cgroup(page, pc) == pc */ - static struct page_cgroup *clear_page_cgroup(struct page *page, struct page_cgroup *pc) { @@ -417,6 +403,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); return (int)((rss * 100L) / total); } + /* * This function is called from vmscan.c. In page reclaiming loop. balance * between active and inactive list is calculated. 
For memory controller @@ -480,7 +467,6 @@ long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); - return (nr_inactive >> priority); } @@ -601,16 +587,11 @@ retry: rcu_read_lock(); mem = rcu_dereference(mm->mem_cgroup); /* - * For every charge from the cgroup, increment reference - * count + * For every charge from the cgroup, increment reference count */ css_get(&mem->css); rcu_read_unlock(); - /* - * If we created the page_cgroup, we should free it on exceeding - * the cgroup limit. - */ while (res_counter_charge(&mem->res, PAGE_SIZE)) { if (!(gfp_mask & __GFP_WAIT)) goto out; @@ -619,12 +600,12 @@ retry: continue; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up - */ + * try_to_free_mem_cgroup_pages() might not give us a full + * picture of reclaim. Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the cgroup. + * Check the limit again to see if the reclaim reduced the + * current usage of the cgroup before giving up + */ if (res_counter_check_under_limit(&mem->res)) continue; @@ -660,7 +641,6 @@ retry: mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - /* Update statistics vector */ __mem_cgroup_add_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); @@ -673,26 +653,19 @@ err: return -ENOMEM; } -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } -/* - * See if the cached pages should be charged at all? - */ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - int ret = 0; if (!mm) mm = &init_mm; - - ret = mem_cgroup_charge_common(page, mm, gfp_mask, + return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE); - return ret; } /* @@ -742,11 +715,11 @@ unlock: * Returns non-zero if a page (under migration) has valid page_cgroup member. * Refcnt of page_cgroup is incremented. */ - int mem_cgroup_prepare_migration(struct page *page) { struct page_cgroup *pc; int ret = 0; + lock_page_cgroup(page); pc = page_get_page_cgroup(page); if (pc && atomic_inc_not_zero(&pc->ref_cnt)) @@ -759,28 +732,30 @@ void mem_cgroup_end_migration(struct page *page) { mem_cgroup_uncharge_page(page); } + /* - * We know both *page* and *newpage* are now not-on-LRU and Pg_locked. + * We know both *page* and *newpage* are now not-on-LRU and PG_locked. * And no race with uncharge() routines because page_cgroup for *page* * has extra one reference by mem_cgroup_prepare_migration. 
*/ - void mem_cgroup_page_migration(struct page *page, struct page *newpage) { struct page_cgroup *pc; struct mem_cgroup *mem; unsigned long flags; struct mem_cgroup_per_zone *mz; + retry: pc = page_get_page_cgroup(page); if (!pc) return; + mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc); if (clear_page_cgroup(page, pc) != pc) goto retry; - spin_lock_irqsave(&mz->lru_lock, flags); + spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_remove_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); @@ -793,7 +768,6 @@ retry: spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); - return; } /* @@ -802,8 +776,7 @@ retry: * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ #define FORCE_UNCHARGE_BATCH (128) -static void -mem_cgroup_force_empty_list(struct mem_cgroup *mem, +static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz, int active) { @@ -837,27 +810,27 @@ retry: } else /* being uncharged ? ...do relax */ break; } + spin_unlock_irqrestore(&mz->lru_lock, flags); if (!list_empty(list)) { cond_resched(); goto retry; } - return; } /* * make mem_cgroup's charge to be 0 if there is no task. * This enables deleting this mem_cgroup. */ - int mem_cgroup_force_empty(struct mem_cgroup *mem) { int ret = -EBUSY; int node, zid; + css_get(&mem->css); /* * page reclaim code (kswapd etc..) will move pages between -` * active_list <-> inactive_list while we don't take a lock. + * active_list <-> inactive_list while we don't take a lock. * So, we have to do loop here until all lists are empty. */ while (mem->res.usage > 0) { @@ -879,8 +852,6 @@ out: return ret; } - - int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) { *tmp = memparse(buf, &buf); @@ -918,8 +889,7 @@ static ssize_t mem_force_empty_write(struct cgroup *cont, size_t nbytes, loff_t *ppos) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - int ret; - ret = mem_cgroup_force_empty(mem); + int ret = mem_cgroup_force_empty(mem); if (!ret) ret = nbytes; return ret; @@ -928,7 +898,6 @@ static ssize_t mem_force_empty_write(struct cgroup *cont, /* * Note: This should be removed if cgroup supports write-only file. */ - static ssize_t mem_force_empty_read(struct cgroup *cont, struct cftype *cft, struct file *file, char __user *userbuf, @@ -937,7 +906,6 @@ static ssize_t mem_force_empty_read(struct cgroup *cont, return -EINVAL; } - static const struct mem_cgroup_stat_desc { const char *msg; u64 unit; @@ -990,8 +958,6 @@ static int mem_control_stat_open(struct inode *unused, struct file *file) return single_open(file, mem_control_stat_show, cont); } - - static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -1057,9 +1023,6 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) kfree(mem->info.nodeinfo[node]); } - -static struct mem_cgroup init_mem_cgroup; - static struct cgroup_subsys_state * mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -1149,7 +1112,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, out: mmput(mm); - return; } struct cgroup_subsys mem_cgroup_subsys = { -- cgit v1.2.3-70-g09d2 From d5b69e38f8cdb1e41cc022305c86c9739bf1ffdb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:10 -0800 Subject: memcg: memcontrol uninlined and static More cleanup to memcontrol.c, this time changing some of the code generated. 
Let the compiler decide what to inline (except for page_cgroup_locked which is only used when CONFIG_DEBUG_VM): the __always_inline on lock_page_cgroup etc. was quite a waste since bit_spin_lock etc. are inlines in a header file; made mem_cgroup_force_empty and mem_cgroup_write_strategy static. Signed-off-by: Hugh Dickins Cc: David Rientjes Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 31ab2c014fa..a59f946c933 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -168,12 +168,12 @@ struct page_cgroup { #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ -static inline int page_cgroup_nid(struct page_cgroup *pc) +static int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc->page); } -static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) +static enum zone_type page_cgroup_zid(struct page_cgroup *pc) { return page_zonenum(pc->page); } @@ -199,14 +199,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); } -static inline struct mem_cgroup_per_zone * +static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) { - BUG_ON(!mem->info.nodeinfo[nid]); return &mem->info.nodeinfo[nid]->zoneinfo[zid]; } -static inline struct mem_cgroup_per_zone * +static struct mem_cgroup_per_zone * page_cgroup_zoneinfo(struct page_cgroup *pc) { struct mem_cgroup *mem = pc->mem_cgroup; @@ -231,16 +230,14 @@ static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, return total; } -static inline -struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, mem_cgroup_subsys_id), struct mem_cgroup, css); } -static inline -struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) +static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { return container_of(task_subsys_state(p, mem_cgroup_subsys_id), struct mem_cgroup, css); @@ -276,13 +273,12 @@ struct page_cgroup *page_get_page_cgroup(struct page *page) return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); } -static void __always_inline lock_page_cgroup(struct page *page) +static void lock_page_cgroup(struct page *page) { bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); - VM_BUG_ON(!page_cgroup_locked(page)); } -static void __always_inline unlock_page_cgroup(struct page *page) +static void unlock_page_cgroup(struct page *page) { bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } @@ -741,16 +737,14 @@ void mem_cgroup_end_migration(struct page *page) void mem_cgroup_page_migration(struct page *page, struct page *newpage) { struct page_cgroup *pc; - struct mem_cgroup *mem; - unsigned long flags; struct mem_cgroup_per_zone *mz; + unsigned long flags; retry: pc = page_get_page_cgroup(page); if (!pc) return; - mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc); if (clear_page_cgroup(page, pc) != pc) goto retry; @@ -822,7 +816,7 @@ retry: * make mem_cgroup's charge to be 0 if there is no task. * This enables deleting this mem_cgroup. 
*/ -int mem_cgroup_force_empty(struct mem_cgroup *mem) +static int mem_cgroup_force_empty(struct mem_cgroup *mem) { int ret = -EBUSY; int node, zid; @@ -852,7 +846,7 @@ out: return ret; } -int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) +static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) { *tmp = memparse(buf, &buf); if (*buf != '\0') -- cgit v1.2.3-70-g09d2 From b9c565d5a29a795f970b4a1340393d8fc6722fb9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:11 -0800 Subject: memcg: remove clear_page_cgroup and atomics Remove clear_page_cgroup: it's an unhelpful helper, see for example how mem_cgroup_uncharge_page had to unlock_page_cgroup just in order to call it (serious races from that? I'm not sure). Once that's gone, you can see it's pointless for page_cgroup's ref_cnt to be atomic: it's always manipulated under lock_page_cgroup, except where force_empty unilaterally reset it to 0 (and how does uncharge's atomic_dec_and_test protect against that?). Simplify this page_cgroup locking: if you've got the lock and the pc is attached, then the ref_cnt must be positive: VM_BUG_ONs to check that, and to check that pc->page matches page (we're on the way to finding why sometimes it doesn't, but this patch doesn't fix that). Signed-off-by: Hugh Dickins Cc: David Rientjes Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 106 +++++++++++++++++++++++--------------------------------- 1 file changed, 43 insertions(+), 63 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a59f946c933..13e9e7d8e49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -161,8 +161,7 @@ struct page_cgroup { struct list_head lru; /* per cgroup LRU list */ struct page *page; struct mem_cgroup *mem_cgroup; - atomic_t ref_cnt; /* Helpful when pages move b/w */ - /* mapped and cached states */ + int ref_cnt; /* cached, mapped, migrating */ int flags; }; #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ @@ -283,27 +282,6 @@ static void unlock_page_cgroup(struct page *page) bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } -/* - * Clear page->page_cgroup member under lock_page_cgroup(). - * If given "pc" value is different from one page->page_cgroup, - * page->cgroup is not cleared. - * Returns a value of page->page_cgroup at lock taken. - * A can can detect failure of clearing by following - * clear_page_cgroup(page, pc) == pc - */ -static struct page_cgroup *clear_page_cgroup(struct page *page, - struct page_cgroup *pc) -{ - struct page_cgroup *ret; - /* lock and clear */ - lock_page_cgroup(page); - ret = page_get_page_cgroup(page); - if (likely(ret == pc)) - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - return ret; -} - static void __mem_cgroup_remove_list(struct page_cgroup *pc) { int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; @@ -555,15 +533,12 @@ retry: * the page has already been accounted. */ if (pc) { - if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { - /* this page is under being uncharged ? 
*/ - unlock_page_cgroup(page); - cpu_relax(); - goto retry; - } else { - unlock_page_cgroup(page); - goto done; - } + VM_BUG_ON(pc->page != page); + VM_BUG_ON(pc->ref_cnt <= 0); + + pc->ref_cnt++; + unlock_page_cgroup(page); + goto done; } unlock_page_cgroup(page); @@ -612,7 +587,7 @@ retry: congestion_wait(WRITE, HZ/10); } - atomic_set(&pc->ref_cnt, 1); + pc->ref_cnt = 1; pc->mem_cgroup = mem; pc->page = page; pc->flags = PAGE_CGROUP_FLAG_ACTIVE; @@ -683,24 +658,24 @@ void mem_cgroup_uncharge_page(struct page *page) if (!pc) goto unlock; - if (atomic_dec_and_test(&pc->ref_cnt)) { - page = pc->page; - mz = page_cgroup_zoneinfo(pc); - /* - * get page->cgroup and clear it under lock. - * force_empty can drop page->cgroup without checking refcnt. - */ + VM_BUG_ON(pc->page != page); + VM_BUG_ON(pc->ref_cnt <= 0); + + if (--(pc->ref_cnt) == 0) { + page_assign_page_cgroup(page, NULL); unlock_page_cgroup(page); - if (clear_page_cgroup(page, pc) == pc) { - mem = pc->mem_cgroup; - css_put(&mem->css); - res_counter_uncharge(&mem->res, PAGE_SIZE); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); - kfree(pc); - } - lock_page_cgroup(page); + + mem = pc->mem_cgroup; + css_put(&mem->css); + res_counter_uncharge(&mem->res, PAGE_SIZE); + + mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_remove_list(pc); + spin_unlock_irqrestore(&mz->lru_lock, flags); + + kfree(pc); + return; } unlock: @@ -714,14 +689,13 @@ unlock: int mem_cgroup_prepare_migration(struct page *page) { struct page_cgroup *pc; - int ret = 0; lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (pc && atomic_inc_not_zero(&pc->ref_cnt)) - ret = 1; + if (pc) + pc->ref_cnt++; unlock_page_cgroup(page); - return ret; + return pc != NULL; } void mem_cgroup_end_migration(struct page *page) @@ -740,15 +714,17 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage) struct mem_cgroup_per_zone *mz; unsigned long flags; -retry: + lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (!pc) + if (!pc) { + unlock_page_cgroup(page); return; + } - mz = page_cgroup_zoneinfo(pc); - if (clear_page_cgroup(page, pc) != pc) - goto retry; + page_assign_page_cgroup(page, NULL); + unlock_page_cgroup(page); + mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_remove_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); @@ -794,15 +770,19 @@ retry: while (--count && !list_empty(list)) { pc = list_entry(list->prev, struct page_cgroup, lru); page = pc->page; - /* Avoid race with charge */ - atomic_set(&pc->ref_cnt, 0); - if (clear_page_cgroup(page, pc) == pc) { + lock_page_cgroup(page); + if (page_get_page_cgroup(page) == pc) { + page_assign_page_cgroup(page, NULL); + unlock_page_cgroup(page); css_put(&mem->css); res_counter_uncharge(&mem->res, PAGE_SIZE); __mem_cgroup_remove_list(pc); kfree(pc); - } else /* being uncharged ? 
...do relax */ + } else { + /* racing uncharge: let page go then retry */ + unlock_page_cgroup(page); break; + } } spin_unlock_irqrestore(&mz->lru_lock, flags); -- cgit v1.2.3-70-g09d2 From 6d48ff8bcfd403ec8d3ef7a56538ea9e6f773b9c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:12 -0800 Subject: memcg: css_put after remove_list mem_cgroup_uncharge_page does css_put on the mem_cgroup before uncharging from it, and before removing page_cgroup from one of its lru lists: isn't there a danger that struct mem_cgroup memory could be freed and reused before completing that, so corrupting something? Never seen it, and for all I know there may be other constraints which make it impossible; but let's be defensive and reverse the ordering there. mem_cgroup_force_empty_list is safe because there's an extra css_get around all its works; but even so, change its ordering the same way round, to help get in the habit of doing it like this. Signed-off-by: Hugh Dickins Cc: David Rientjes Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13e9e7d8e49..66d0e84cefa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -665,15 +665,15 @@ void mem_cgroup_uncharge_page(struct page *page) page_assign_page_cgroup(page, NULL); unlock_page_cgroup(page); - mem = pc->mem_cgroup; - css_put(&mem->css); - res_counter_uncharge(&mem->res, PAGE_SIZE); - mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_remove_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + mem = pc->mem_cgroup; + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); + kfree(pc); return; } @@ -774,9 +774,9 @@ retry: if (page_get_page_cgroup(page) == pc) { page_assign_page_cgroup(page, NULL); unlock_page_cgroup(page); - css_put(&mem->css); - res_counter_uncharge(&mem->res, PAGE_SIZE); __mem_cgroup_remove_list(pc); + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); kfree(pc); } else { /* racing uncharge: let page go then retry */ -- cgit v1.2.3-70-g09d2 From 2680eed723b664d83e6181ae275fac0ec8fa05ff Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:13 -0800 Subject: memcg: fix mem_cgroup_move_lists locking Ever since the VM_BUG_ON(page_get_page_cgroup(page)) (now Bad page state) went into page freeing, I've hit it from time to time in testing on some machines, sometimes only after many days. Recently found a machine which could usually produce it within a few hours, which got me there at last. The culprit is mem_cgroup_move_lists, whose locking is inadequate; and the arrangement of structures was such that you got page_cgroups from the lru list neatly put on to SLUB's freelist. Kamezawa-san identified the same hole independently. The main problem was that it was missing the lock_page_cgroup it needs to safely page_get_page_cgroup; but it's tricky to go beyond that too, and I couldn't do it with SLAB_DESTROY_BY_RCU as I'd expected. See the code for comments on the constraints. This patch immediately gets replaced by a simpler one from Hirokazu-san; but is it just foolish pride that tells me to put this one on record, in case we need to come back to it later? 
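To make the inversion concrete, here is a userspace sketch of the pattern this patch adopts, not the kernel code itself: pthread mutexes stand in for the page_cgroup bit-lock and mz->lru_lock, the css_get/css_put pinning is elided, and every name is invented for the illustration.

#include <pthread.h>

static pthread_mutex_t pc_lock  = PTHREAD_MUTEX_INITIALIZER; /* bit-lock stand-in */
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER; /* mz->lru_lock stand-in */
static void *page_pc;			/* page->page_cgroup; writers hold both locks */

/* mem_cgroup_force_empty_list's order: lru lock first, pc lock second. */
void force_empty_side(void)
{
	pthread_mutex_lock(&lru_lock);
	pthread_mutex_lock(&pc_lock);
	page_pc = NULL;			/* unhook under both locks */
	pthread_mutex_unlock(&pc_lock);
	pthread_mutex_unlock(&lru_lock);
}

/* mem_cgroup_move_lists needs the locks the other way round, so it must
 * neither block on pc_lock nor hold it while taking lru_lock: */
void move_lists_side(void)
{
	if (pthread_mutex_trylock(&pc_lock) != 0)
		return;			/* contended: leave the page alone */
	void *pc = page_pc;		/* stabilise what we need ... */
	pthread_mutex_unlock(&pc_lock);	/* ... and drop it before lru_lock */

	pthread_mutex_lock(&lru_lock);
	if (pc && page_pc == pc) {
		/* revalidated under lru_lock: safe to move between lists */
	}
	pthread_mutex_unlock(&lru_lock);
}

Giving up on contention is what makes this safe: mem_cgroup_isolate_pages already tolerates a page left on the wrong list, so a skipped move costs accuracy, not correctness.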
Signed-off-by: Hugh Dickins Cc: David Rientjes Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Hirokazu Takahashi Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 66d0e84cefa..dcbe30aad1d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -277,6 +277,11 @@ static void lock_page_cgroup(struct page *page) bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); } +static int try_lock_page_cgroup(struct page *page) +{ + return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); +} + static void unlock_page_cgroup(struct page *page) { bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); @@ -348,17 +353,49 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) void mem_cgroup_move_lists(struct page *page, bool active) { struct page_cgroup *pc; + struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; unsigned long flags; - pc = page_get_page_cgroup(page); - if (!pc) + /* + * We cannot lock_page_cgroup while holding zone's lru_lock, + * because other holders of lock_page_cgroup can be interrupted + * with an attempt to rotate_reclaimable_page. But we cannot + * safely get to page_cgroup without it, so just try_lock it: + * mem_cgroup_isolate_pages allows for page left on wrong list. + */ + if (!try_lock_page_cgroup(page)) return; - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_move_lists(pc, active); - spin_unlock_irqrestore(&mz->lru_lock, flags); + /* + * Now page_cgroup is stable, but we cannot acquire mz->lru_lock + * while holding it, because mem_cgroup_force_empty_list does the + * reverse. Get a hold on the mem_cgroup before unlocking, so that + * the zoneinfo remains stable, then take mz->lru_lock; then check + * that page still points to pc and pc (even if freed and reassigned + * to that same page meanwhile) still points to the same mem_cgroup. + * Then we know mz still points to the right spinlock, so it's safe + * to move_lists (page->page_cgroup might be reset while we do so, but + * that doesn't matter: pc->page is stable till we drop mz->lru_lock). + * We're being a little naughty not to try_lock_page_cgroup again + * inside there, but we are safe, aren't we? Aren't we? Whistle... + */ + pc = page_get_page_cgroup(page); + if (pc) { + mem = pc->mem_cgroup; + mz = page_cgroup_zoneinfo(pc); + css_get(&mem->css); + + unlock_page_cgroup(page); + + spin_lock_irqsave(&mz->lru_lock, flags); + if (page_get_page_cgroup(page) == pc && pc->mem_cgroup == mem) + __mem_cgroup_move_lists(pc, active); + spin_unlock_irqrestore(&mz->lru_lock, flags); + + css_put(&mem->css); + } else + unlock_page_cgroup(page); } /* -- cgit v1.2.3-70-g09d2 From 9b3c0a07e0fca35e36751680de3e4c76dbff5df3 Mon Sep 17 00:00:00 2001 From: Hirokazu Takahashi Date: Tue, 4 Mar 2008 14:29:15 -0800 Subject: memcg: simplify force_empty and move_lists As for force_empty, though this may not be the main topic here, mem_cgroup_force_empty_list() can be implemented simpler. It is possible to make the function just call mem_cgroup_uncharge_page() instead of releasing page_cgroups by itself. The tip is to call get_page() before invoking mem_cgroup_uncharge_page(), so the page won't be released during this function. 
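The get_page() trick described above generalises to any refcounted object: pin it with a temporary reference before calling a routine that may drop the last one. A minimal userspace sketch with C11 atomics, where obj, get_ref and put_ref are invented stand-ins for page, get_page and put_page:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refs;
};

void get_ref(struct obj *o)
{
	atomic_fetch_add(&o->refs, 1);
}

void put_ref(struct obj *o)
{
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		free(o);		/* that was the last reference */
}

/* force_empty's pattern: "uncharging" may drop the reference that was
 * keeping o alive, so hold a temporary reference of our own across it. */
void drain_one(struct obj *o)
{
	get_ref(o);			/* get_page() analogue */
	put_ref(o);			/* worst case of the uncharge */
	/* o is still valid here purely because of our pin */
	put_ref(o);			/* put_page() analogue: may free o */
}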
Kamezawa-san points out that by the time mem_cgroup_uncharge_page() uncharges, the page might have been reassigned to an lru of a different mem_cgroup, and now be emptied from that; but Hugh claims that's okay, the end state is the same as when it hasn't gone to another list. And once force_empty stops taking lock_page_cgroup within mz->lru_lock, mem_cgroup_move_lists() can be simplified to take mz->lru_lock directly while holding page_cgroup lock (but still has to use try_lock_page_cgroup). Signed-off-by: Hirokazu Takahashi Signed-off-by: Hugh Dickins Cc: David Rientjes Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: YAMAMOTO Takashi Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 62 ++++++++++++--------------------------------------------- 1 file changed, 13 insertions(+), 49 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dcbe30aad1d..f72067094e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -353,7 +353,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) void mem_cgroup_move_lists(struct page *page, bool active) { struct page_cgroup *pc; - struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; unsigned long flags; @@ -367,35 +366,14 @@ void mem_cgroup_move_lists(struct page *page, bool active) if (!try_lock_page_cgroup(page)) return; - /* - * Now page_cgroup is stable, but we cannot acquire mz->lru_lock - * while holding it, because mem_cgroup_force_empty_list does the - * reverse. Get a hold on the mem_cgroup before unlocking, so that - * the zoneinfo remains stable, then take mz->lru_lock; then check - * that page still points to pc and pc (even if freed and reassigned - * to that same page meanwhile) still points to the same mem_cgroup. - * Then we know mz still points to the right spinlock, so it's safe - * to move_lists (page->page_cgroup might be reset while we do so, but - * that doesn't matter: pc->page is stable till we drop mz->lru_lock). - * We're being a little naughty not to try_lock_page_cgroup again - * inside there, but we are safe, aren't we? Aren't we? Whistle... 
- */ pc = page_get_page_cgroup(page); if (pc) { - mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc); - css_get(&mem->css); - - unlock_page_cgroup(page); - spin_lock_irqsave(&mz->lru_lock, flags); - if (page_get_page_cgroup(page) == pc && pc->mem_cgroup == mem) - __mem_cgroup_move_lists(pc, active); + __mem_cgroup_move_lists(pc, active); spin_unlock_irqrestore(&mz->lru_lock, flags); - - css_put(&mem->css); - } else - unlock_page_cgroup(page); + } + unlock_page_cgroup(page); } /* @@ -789,7 +767,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, { struct page_cgroup *pc; struct page *page; - int count; + int count = FORCE_UNCHARGE_BATCH; unsigned long flags; struct list_head *list; @@ -798,35 +776,21 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, else list = &mz->inactive_list; - if (list_empty(list)) - return; -retry: - count = FORCE_UNCHARGE_BATCH; spin_lock_irqsave(&mz->lru_lock, flags); - - while (--count && !list_empty(list)) { + while (!list_empty(list)) { pc = list_entry(list->prev, struct page_cgroup, lru); page = pc->page; - lock_page_cgroup(page); - if (page_get_page_cgroup(page) == pc) { - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - __mem_cgroup_remove_list(pc); - res_counter_uncharge(&mem->res, PAGE_SIZE); - css_put(&mem->css); - kfree(pc); - } else { - /* racing uncharge: let page go then retry */ - unlock_page_cgroup(page); - break; + get_page(page); + spin_unlock_irqrestore(&mz->lru_lock, flags); + mem_cgroup_uncharge_page(page); + put_page(page); + if (--count <= 0) { + count = FORCE_UNCHARGE_BATCH; + cond_resched(); } + spin_lock_irqsave(&mz->lru_lock, flags); } - spin_unlock_irqrestore(&mz->lru_lock, flags); - if (!list_empty(list)) { - cond_resched(); - goto retry; - } } /* -- cgit v1.2.3-70-g09d2 From fb59e9f1e9786635ea12e12bf6adbb132e10f979 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 Mar 2008 14:29:16 -0800 Subject: memcg: fix oops on NULL lru list While testing force_empty, during an exit_mmap, __mem_cgroup_remove_list called from mem_cgroup_uncharge_page oopsed on a NULL pointer in the lru list. I couldn't see what racing tasks on other cpus were doing, but surmise that another must have been in mem_cgroup_charge_common on the same page, between its unlock_page_cgroup and spin_lock_irqsave near done (thanks to that kzalloc which I'd almost changed to a kmalloc). Normally such a race cannot happen, the ref_cnt prevents it, the final uncharge cannot race with the initial charge. But force_empty buggers the ref_cnt, that's what it's all about; and thereafter forced pages are vulnerable to races such as this (just think of a shared page also mapped into an mm of another mem_cgroup than that just emptied). And remain vulnerable until they're freed indefinitely later. This patch just fixes the oops by moving the unlock_page_cgroups down below adding to and removing from the list (only possible given the previous patch); and while we're at it, we might as well make it an invariant that page->page_cgroup is always set while pc is on lru. But this behaviour of force_empty seems highly unsatisfactory to me: why have a ref_cnt if we always have to cope with it being violated (as in the earlier page migration patch). We may prefer force_empty to move pages to an orphan mem_cgroup (could be the root, but better not), from which other cgroups could recover them; we might need to reverse the locking again; but no time now for such concerns. 
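The invariant being established, that page->page_cgroup stays set for as long as the pc is on an lru list, comes down to ordering: unlink first, unpublish second, free only after both. A compressed userspace sketch, with a single mutex standing in for both lock_page_cgroup and mz->lru_lock, and all types invented for the example:

#include <pthread.h>
#include <stdlib.h>

struct pc { struct pc *prev, *next; };	/* lru linkage */
struct page { pthread_mutex_t lock; struct pc *pc; };

static void lru_del(struct pc *pc)	/* unlink from its lru list */
{
	pc->prev->next = pc->next;
	pc->next->prev = pc->prev;
}

void uncharge(struct page *page)
{
	pthread_mutex_lock(&page->lock);
	struct pc *pc = page->pc;
	if (!pc) {
		pthread_mutex_unlock(&page->lock);
		return;
	}
	lru_del(pc);		/* off the list while still published */
	page->pc = NULL;	/* unpublish only once it is off the lru */
	pthread_mutex_unlock(&page->lock);
	free(pc);		/* no lru walker can reach pc any more */
}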
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f72067094e0..8b9f6cae938 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -623,13 +623,13 @@ retry: goto retry; } page_assign_page_cgroup(page, pc); - unlock_page_cgroup(page); mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(page); done: return 0; out: @@ -677,14 +677,14 @@ void mem_cgroup_uncharge_page(struct page *page) VM_BUG_ON(pc->ref_cnt <= 0); if (--(pc->ref_cnt) == 0) { - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_remove_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + page_assign_page_cgroup(page, NULL); + unlock_page_cgroup(page); + mem = pc->mem_cgroup; res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); @@ -736,23 +736,24 @@ void mem_cgroup_page_migration(struct page *page, struct page *newpage) return; } - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_remove_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + page_assign_page_cgroup(page, NULL); + unlock_page_cgroup(page); + pc->page = newpage; lock_page_cgroup(newpage); page_assign_page_cgroup(newpage, pc); - unlock_page_cgroup(newpage); mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + + unlock_page_cgroup(newpage); } /* -- cgit v1.2.3-70-g09d2
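A closing note on the convention every patch in this series leans on: page->page_cgroup packs a bit-spinlock into bit 0 of a pointer that is guaranteed at least two-byte alignment. A userspace approximation with C11 atomics; the names are invented, and the kernel's bit_spin_lock additionally elides the stored lock bit on non-debug UP builds, which this sketch does not model:

#include <stdint.h>
#include <stdatomic.h>

#define PC_LOCK_BIT	0x1UL	/* bit 0 is free: pc is 2-byte aligned */

static _Atomic uintptr_t page_pc;	/* page->page_cgroup stand-in */

void pc_lock(void)
{
	uintptr_t old;
	do {	/* spin until we install the lock bit over an unlocked word */
		old = atomic_load(&page_pc) & ~PC_LOCK_BIT;
	} while (!atomic_compare_exchange_weak(&page_pc, &old,
					       old | PC_LOCK_BIT));
}

void pc_unlock(void)
{
	atomic_fetch_and(&page_pc, ~PC_LOCK_BIT);
}

void *pc_get(void)	/* page_get_page_cgroup analogue */
{
	return (void *)(atomic_load(&page_pc) & ~PC_LOCK_BIT);
}

void pc_set(void *pc)	/* page_assign_page_cgroup: caller holds the lock */
{
	atomic_store(&page_pc, (uintptr_t)pc | PC_LOCK_BIT);
}

Packing the lock into the pointer keeps struct page from growing by a word, at the cost of masking the bit out of every load; that mask is exactly what page_get_page_cgroup's "& ~PAGE_CGROUP_LOCK" does.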