diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 16 | ||||
-rw-r--r-- | mm/bootmem.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 84 | ||||
-rw-r--r-- | mm/memcontrol.c | 630 | ||||
-rw-r--r-- | mm/memory-failure.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 75 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/mlock.c | 4 | ||||
-rw-r--r-- | mm/nobootmem.c | 8 | ||||
-rw-r--r-- | mm/nommu.c | 6 | ||||
-rw-r--r-- | mm/oom_kill.c | 13 | ||||
-rw-r--r-- | mm/page-writeback.c | 10 | ||||
-rw-r--r-- | mm/page_alloc.c | 10 | ||||
-rw-r--r-- | mm/page_cgroup.c | 131 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 18 | ||||
-rw-r--r-- | mm/rmap.c | 5 | ||||
-rw-r--r-- | mm/shmem.c | 1 | ||||
-rw-r--r-- | mm/slub.c | 6 | ||||
-rw-r--r-- | mm/swap_state.c | 5 | ||||
-rw-r--r-- | mm/swapfile.c | 46 | ||||
-rw-r--r-- | mm/vmscan.c | 2 |
22 files changed, 573 insertions, 517 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d3022..0d9a036ada6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,17 +14,11 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_wb = nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdi == bdi) - sb->s_bdi = NULL; + sb->s_bdi = &default_backing_dev_info; } spin_unlock(&sb_lock); } @@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } bdi_unregister(bdi); diff --git a/mm/bootmem.c b/mm/bootmem.c index 07aeb89e396..01d5a4b3dd0 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -34,14 +34,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); diff --git a/mm/filemap.c b/mm/filemap.c index f807afda86f..c641edf553a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -80,8 +80,8 @@ * ->i_mutex * ->i_alloc_sem (various) * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * inode_wb_list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * * ->i_mmap_lock @@ -98,8 +98,10 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) + * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * inode_wb_list_lock (zap_pte_range->set_page_dirty) + * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * * (code doesn't rely on that order, so you could switch it around) @@ -164,45 +166,15 @@ void delete_from_page_cache(struct page *page) } EXPORT_SYMBOL(delete_from_page_cache); -static int sync_page(void *word) +static int sleep_on_page(void *word) { - struct address_space *mapping; - struct page *page; - - page = container_of((unsigned long *)word, struct page, flags); - - /* - * page_mapping() is being called without PG_locked held. - * Some knowledge of the state and use of the page is used to - * reduce the requirements down to a memory barrier. - * The danger here is of a stale page_mapping() return value - * indicating a struct address_space different from the one it's - * associated with when it is associated with one. - * After smp_mb(), it's either the correct page_mapping() for - * the page, or an old page_mapping() and the page's own - * page_mapping() has gone NULL. - * The ->sync_page() address_space operation must tolerate - * page_mapping() going NULL. By an amazing coincidence, - * this comes about because none of the users of the page - * in the ->sync_page() methods make essential use of the - * page_mapping(), merely passing the page down to the backing - * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page_private(page) is - * of interest. When page_mapping() does go NULL, the entire - * call stack gracefully ignores the page and returns. - * -- wli - */ - smp_mb(); - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->sync_page) - mapping->a_ops->sync_page(page); io_schedule(); return 0; } -static int sync_page_killable(void *word) +static int sleep_on_page_killable(void *word) { - sync_page(word); + sleep_on_page(word); return fatal_signal_pending(current) ? -EINTR : 0; } @@ -558,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp) EXPORT_SYMBOL(__page_cache_alloc); #endif -static int __sleep_on_page_lock(void *word) -{ - io_schedule(); - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -591,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sync_page, + __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@ -655,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback); /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock - * - * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some - * random driver's requestfn sets TASK_RUNNING, we could busywait. However - * chances are that on the second loop, the block layer's plug list is empty, - * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -675,24 +636,10 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sync_page_killable, TASK_KILLABLE); + sleep_on_page_killable, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); -/** - * __lock_page_nosync - get a lock on the page, without calling sync_page() - * @page: the page to lock - * - * Variant of lock_page that does not require the caller to hold a reference - * on the page's mapping. - */ -void __lock_page_nosync(struct page *page) -{ - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, - TASK_UNINTERRUPTIBLE); -} - int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -1407,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; + struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; + blk_start_plug(&plug); + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1485,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: + blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); @@ -2596,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); + blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2611,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(generic_file_aio_write); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e1ee6ad9c97..1f0b460fe58 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0; #define do_swap_account (0) #endif -/* - * Per memcg event counter is incremented at every pagein/pageout. This counter - * is used for trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - * - * These values will be used as !((event) & ((1 <<(thresh)) - 1)) - */ -#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ -#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ /* * Statistics for memory cgroup. @@ -93,19 +84,36 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ - MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ - /* incremented at every pagein/pageout */ - MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ - MEM_CGROUP_STAT_NSTATS, }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_NSTATS, +}; +/* + * Per memcg event counter is incremented at every pagein/pageout. With THP, + * it will be incremated by the number of pages. This counter is used for + * for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + */ +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_NTARGETS, +}; +#define THRESHOLDS_EVENTS_TARGET (128) +#define SOFTLIMIT_EVENTS_TARGET (1024) + struct mem_cgroup_stat_cpu { - s64 count[MEM_CGROUP_STAT_NSTATS]; + long count[MEM_CGROUP_STAT_NSTATS]; + unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long targets[MEM_CGROUP_NTARGETS]; }; /* @@ -218,12 +226,6 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - - /* - protect against reclaim related member. - */ - spinlock_t reclaim_param_lock; - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. @@ -327,13 +329,6 @@ enum charge_type { NR_CHARGE_TYPE, }; -/* only for here (for easy reading.) */ -#define PCGF_CACHE (1UL << PCG_CACHE) -#define PCGF_USED (1UL << PCG_USED) -#define PCGF_LOCK (1UL << PCG_LOCK) -/* Not used, but added here for completeness */ -#define PCGF_ACCT (1UL << PCG_ACCT) - /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) @@ -371,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) +page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) { - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); - - if (!mem) - return NULL; + int nid = page_to_nid(page); + int zid = page_zonenum(page); return mem_cgroup_zoneinfo(mem, nid, zid); } @@ -504,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) } } -static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) -{ - return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; -} - static struct mem_cgroup_per_zone * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { @@ -565,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * common workload, threashold and synchonization as vmstat[] should be * implemented. */ -static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static long mem_cgroup_read_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { + long val = 0; int cpu; - s64 val = 0; get_online_cpus(); for_each_online_cpu(cpu) @@ -583,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, return val; } -static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) +static long mem_cgroup_local_usage(struct mem_cgroup *mem) { - s64 ret; + long ret; ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); @@ -599,6 +585,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } +static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, + enum mem_cgroup_events_index idx) +{ + unsigned long val = 0; + int cpu; + + for_each_online_cpu(cpu) + val += per_cpu(mem->stat->events[idx], cpu); +#ifdef CONFIG_HOTPLUG_CPU + spin_lock(&mem->pcp_counter_lock); + val += mem->nocpu_base.events[idx]; + spin_unlock(&mem->pcp_counter_lock); +#endif + return val; +} + static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, bool file, int nr_pages) { @@ -611,13 +613,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); else { - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); + __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); preempt_enable(); } @@ -637,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, return total; } -static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) +static bool __memcg_event_check(struct mem_cgroup *mem, int target) { - s64 val; + unsigned long val, next; - val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = this_cpu_read(mem->stat->targets[target]); + /* from time_after() in jiffies.h */ + return ((long)next - (long)val < 0); +} - return !(val & ((1 << event_mask_shift) - 1)); +static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) +{ + unsigned long val, next; + + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + return; + } + + this_cpu_write(mem->stat->targets[target], next); } /* @@ -653,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) static void memcg_check_events(struct mem_cgroup *mem, struct page *page) { /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { + if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { mem_cgroup_threshold(mem); - if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) + __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); + if (unlikely(__memcg_event_check(mem, + MEM_CGROUP_TARGET_SOFTLIMIT))){ mem_cgroup_update_tree(mem, page); + __mem_cgroup_target_update(mem, + MEM_CGROUP_TARGET_SOFTLIMIT); + } } } @@ -815,7 +843,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) @@ -851,7 +879,7 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page) smp_rmb(); if (mem_cgroup_is_root(pc->mem_cgroup)) return; - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move_tail(&pc->lru, &mz->lists[lru]); } @@ -871,7 +899,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) smp_rmb(); if (mem_cgroup_is_root(pc->mem_cgroup)) return; - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move(&pc->lru, &mz->lists[lru]); } @@ -888,7 +916,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); @@ -898,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) } /* - * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to - * lru because the page may.be reused after it's fully uncharged (because of - * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge - * it again. This function is only used to charge SwapCache. It's done under - * lock_page and expected that zone->lru_lock is never held. + * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed + * while it's linked to lru because the page may be reused after it's fully + * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. + * It's done under lock_page and expected that zone->lru_lock isnever held. */ -static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) +static void mem_cgroup_lru_del_before_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Doing this check without taking ->lru_lock seems wrong but this + * is safe. Because if page_cgroup's USED bit is unset, the page + * will not be added to any memcg's LRU. If page_cgroup's USED bit is + * set, the commit after this will fail, anyway. + * This all charge/uncharge is done under some mutual execustion. + * So, we don't need to taking care of changes in USED bit. + */ + if (likely(!PageLRU(page))) + return; + spin_lock_irqsave(&zone->lru_lock, flags); /* * Forget old LRU when this page_cgroup is *not* used. This Used bit @@ -920,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) spin_unlock_irqrestore(&zone->lru_lock, flags); } -static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) +static void mem_cgroup_lru_add_after_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* taking care of that the page is added to LRU while we commit it */ + if (likely(!PageLRU(page))) + return; spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ if (PageLRU(page) && !PageCgroupAcctLRU(pc)) @@ -1058,10 +1099,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - mz = page_cgroup_zoneinfo(pc); - if (!mz) - return NULL; - + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); return &mz->reclaim_stat; } @@ -1093,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (scan >= nr_to_scan) break; - page = pc->page; if (unlikely(!PageCgroupUsed(pc))) continue; + + page = lookup_cgroup_page(pc); + if (unlikely(!PageLRU(page))) continue; @@ -1127,49 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) -{ - if (do_swap_account) { - if (res_counter_check_under_limit(&mem->res) && - res_counter_check_under_limit(&mem->memsw)) - return true; - } else - if (res_counter_check_under_limit(&mem->res)) - return true; - return false; -} - /** - * mem_cgroup_check_margin - check if the memory cgroup allows charging - * @mem: memory cgroup to check - * @bytes: the number of bytes the caller intends to charge + * mem_cgroup_margin - calculate chargeable space of a memory cgroup + * @mem: the memory cgroup * - * Returns a boolean value on whether @mem can be charged @bytes or - * whether this would exceed the limit. + * Returns the maximum amount of memory @mem can be charged with, in + * pages. */ -static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) { - if (!res_counter_check_margin(&mem->res, bytes)) - return false; - if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) - return false; - return true; + unsigned long long margin; + + margin = res_counter_margin(&mem->res); + if (do_swap_account) + margin = min(margin, res_counter_margin(&mem->memsw)); + return margin >> PAGE_SHIFT; } static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; - unsigned int swappiness; /* root ? */ if (cgrp->parent == NULL) return vm_swappiness; - spin_lock(&memcg->reclaim_param_lock); - swappiness = memcg->swappiness; - spin_unlock(&memcg->reclaim_param_lock); - - return swappiness; + return memcg->swappiness; } static void mem_cgroup_start_move(struct mem_cgroup *mem) @@ -1385,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) rcu_read_unlock(); /* Updates scanning parameter */ - spin_lock(&root_mem->reclaim_param_lock); if (!css) { /* this means start scan from ID:1 */ root_mem->last_scanned_child = 0; } else root_mem->last_scanned_child = found; - spin_unlock(&root_mem->reclaim_param_lock); } return ret; @@ -1420,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - unsigned long excess = mem_cgroup_get_excess(root_mem); + unsigned long excess; + + excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) @@ -1477,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; total += ret; if (check_soft) { - if (res_counter_check_under_soft_limit(&root_mem->res)) + if (!res_counter_soft_limit_excess(&root_mem->res)) return total; - } else if (mem_cgroup_check_under_limit(root_mem)) + } else if (mem_cgroup_margin(root_mem)) return 1 + total; } return total; @@ -1687,17 +1710,17 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat); * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. */ -#define CHARGE_SIZE (32 * PAGE_SIZE) +#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ - int charge; + unsigned int nr_pages; struct work_struct work; }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static atomic_t memcg_drain_count; /* - * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed + * Try to consume stocked charge on this cpu. If success, one page is consumed * from local stock and true is returned. If the stock is 0 or charges from a * cgroup which is not current target, returns false. This stock will be * refilled. @@ -1708,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem) bool ret = true; stock = &get_cpu_var(memcg_stock); - if (mem == stock->cached && stock->charge) - stock->charge -= PAGE_SIZE; + if (mem == stock->cached && stock->nr_pages) + stock->nr_pages--; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -1723,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; - if (stock->charge) { - res_counter_uncharge(&old->res, stock->charge); + if (stock->nr_pages) { + unsigned long bytes = stock->nr_pages * PAGE_SIZE; + + res_counter_uncharge(&old->res, bytes); if (do_swap_account) - res_counter_uncharge(&old->memsw, stock->charge); + res_counter_uncharge(&old->memsw, bytes); + stock->nr_pages = 0; } stock->cached = NULL; - stock->charge = 0; } /* @@ -1746,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy) * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *mem, int val) +static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) { struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); @@ -1754,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val) drain_stock(stock); stock->cached = mem; } - stock->charge += val; + stock->nr_pages += nr_pages; put_cpu_var(memcg_stock); } @@ -1806,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) spin_lock(&mem->pcp_counter_lock); for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { - s64 x = per_cpu(mem->stat->count[i], cpu); + long x = per_cpu(mem->stat->count[i], cpu); per_cpu(mem->stat->count[i], cpu) = 0; mem->nocpu_base.count[i] += x; } + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long x = per_cpu(mem->stat->events[i], cpu); + + per_cpu(mem->stat->events[i], cpu) = 0; + mem->nocpu_base.events[i] += x; + } /* need to clear ON_MOVE value, works as a kind of lock. */ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; spin_unlock(&mem->pcp_counter_lock); @@ -1860,9 +1891,10 @@ enum { CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; -static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, - int csize, bool oom_check) +static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, + unsigned int nr_pages, bool oom_check) { + unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; unsigned long flags = 0; @@ -1883,14 +1915,13 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * csize can be either a huge page (HPAGE_SIZE), a batch of - * regular pages (CHARGE_SIZE), or a single regular page - * (PAGE_SIZE). + * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch + * of regular pages (CHARGE_BATCH), or a single regular page (1). * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (csize == CHARGE_SIZE) + if (nr_pages == CHARGE_BATCH) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) @@ -1898,7 +1929,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, gfp_mask, flags); - if (mem_cgroup_check_margin(mem_over_limit, csize)) + if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* * Even though the limit is exceeded at this point, reclaim @@ -1909,7 +1940,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (csize == PAGE_SIZE && ret) + if (nr_pages == 1 && ret) return CHARGE_RETRY; /* @@ -1935,13 +1966,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, - struct mem_cgroup **memcg, bool oom, - int page_size) + unsigned int nr_pages, + struct mem_cgroup **memcg, + bool oom) { + unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1966,7 +1998,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (page_size == PAGE_SIZE && consume_stock(mem)) + if (nr_pages == 1 && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1989,7 +2021,7 @@ again: rcu_read_unlock(); goto done; } - if (page_size == PAGE_SIZE && consume_stock(mem)) { + if (nr_pages == 1 && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2024,13 +2056,12 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); - + ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); switch (ret) { case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = page_size; + batch = nr_pages; css_put(&mem->css); mem = NULL; goto again; @@ -2051,8 +2082,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > page_size) - refill_stock(mem, csize - page_size); + if (batch > nr_pages) + refill_stock(mem, batch - nr_pages); css_put(&mem->css); done: *memcg = mem; @@ -2071,21 +2102,17 @@ bypass: * gotten by try_charge(). */ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, - unsigned long count) + unsigned int nr_pages) { if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE * count); + unsigned long bytes = nr_pages * PAGE_SIZE; + + res_counter_uncharge(&mem->res, bytes); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); + res_counter_uncharge(&mem->memsw, bytes); } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, - int page_size) -{ - __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); -} - /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if @@ -2134,20 +2161,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) } static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page *page, + unsigned int nr_pages, struct page_cgroup *pc, - enum charge_type ctype, - int page_size) + enum charge_type ctype) { - int nr_pages = page_size >> PAGE_SHIFT; - - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); + __mem_cgroup_cancel_charge(mem, nr_pages); return; } /* @@ -2184,7 +2206,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ - memcg_check_events(mem, pc->page); + memcg_check_events(mem, page); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2221,7 +2243,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) * We hold lru_lock, then, reduce counter directly. */ lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc); + mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); MEM_CGROUP_ZSTAT(mz, lru) -= 1; } tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; @@ -2230,7 +2252,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) #endif /** - * __mem_cgroup_move_account - move account of the page + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. @@ -2238,25 +2262,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) - * - the pc is locked, used, and ->mem_cgroup points to @from. + * - compound_lock is held when nr_pages > 1 * * This function doesn't do "charge" nor css_get to new cgroup. It should be * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is * true, this function does "uncharge" from old cgroup, but it doesn't if * @uncharge is false, so a caller should do "uncharge". */ - -static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, - int charge_size) +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct page_cgroup *pc, + struct mem_cgroup *from, + struct mem_cgroup *to, + bool uncharge) { - int nr_pages = charge_size >> PAGE_SHIFT; + unsigned long flags; + int ret; VM_BUG_ON(from == to); - VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!page_is_cgroup_locked(pc)); - VM_BUG_ON(!PageCgroupUsed(pc)); - VM_BUG_ON(pc->mem_cgroup != from); + VM_BUG_ON(PageLRU(page)); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + lock_page_cgroup(pc); + + ret = -EINVAL; + if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + goto unlock; + + move_lock_page_cgroup(pc, &flags); if (PageCgroupFileMapped(pc)) { /* Update mapped_file data for mem_cgroup */ @@ -2268,7 +2309,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from, charge_size); + __mem_cgroup_cancel_charge(from, nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2280,40 +2321,16 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * garanteed that "to" is never removed. So, we don't check rmdir * status here. */ -} - -/* - * check whether the @pc is valid for moving account and call - * __mem_cgroup_move_account() - */ -static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, - bool uncharge, int charge_size) -{ - int ret = -EINVAL; - unsigned long flags; - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) - return -EBUSY; - - lock_page_cgroup(pc); - if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { - move_lock_page_cgroup(pc, &flags); - __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); - move_unlock_page_cgroup(pc, &flags); - ret = 0; - } + move_unlock_page_cgroup(pc, &flags); + ret = 0; +unlock: unlock_page_cgroup(pc); /* * check events */ - memcg_check_events(to, pc->page); - memcg_check_events(from, pc->page); + memcg_check_events(to, page); + memcg_check_events(from, page); +out: return ret; } @@ -2321,16 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, * move charges to its parent. */ -static int mem_cgroup_move_parent(struct page_cgroup *pc, +static int mem_cgroup_move_parent(struct page *page, + struct page_cgroup *pc, struct mem_cgroup *child, gfp_t gfp_mask) { - struct page *page = pc->page; struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - int page_size = PAGE_SIZE; - unsigned long flags; + unsigned int nr_pages; + unsigned long uninitialized_var(flags); int ret; /* Is ROOT ? */ @@ -2343,23 +2360,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (isolate_lru_page(page)) goto put; - if (PageTransHuge(page)) - page_size = HPAGE_SIZE; + nr_pages = hpage_nr_pages(page); parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, - &parent, false, page_size); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); if (ret || !parent) goto put_back; - if (page_size > PAGE_SIZE) + if (nr_pages > 1) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(pc, child, parent, true, page_size); + ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent, page_size); + __mem_cgroup_cancel_charge(parent, nr_pages); - if (page_size > PAGE_SIZE) + if (nr_pages > 1) compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); @@ -2379,13 +2394,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; - int page_size = PAGE_SIZE; + unsigned int nr_pages = 1; struct page_cgroup *pc; bool oom = true; int ret; if (PageTransHuge(page)) { - page_size <<= compound_order(page); + nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); /* * Never OOM-kill a process for a huge page. The @@ -2395,16 +2410,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } pc = lookup_page_cgroup(page); - /* can happen at boot */ - if (unlikely(!pc)) - return 0; - prefetchw(pc); + BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype, page_size); + __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); return 0; } @@ -2432,9 +2444,26 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); +static void +__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, + enum charge_type ctype) +{ + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * In some case, SwapCache, FUSE(splice_buf->radixtree), the page + * is already on LRU. It means the page may on some other page_cgroup's + * LRU. Take care of it. + */ + mem_cgroup_lru_del_before_commit(page); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + mem_cgroup_lru_add_after_commit(page); + return; +} + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + struct mem_cgroup *mem = NULL; int ret; if (mem_cgroup_disabled()) @@ -2469,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; - if (page_is_file_cache(page)) - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + if (page_is_file_cache(page)) { + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); + if (ret || !mem) + return ret; + /* + * FUSE reuses pages without going through the final + * put that would remove them from the LRU list, make + * sure that they get relinked properly. + */ + __mem_cgroup_commit_charge_lrucare(page, mem, + MEM_CGROUP_CHARGE_TYPE_CACHE); + return ret; + } /* shmem */ if (PageSwapCache(page)) { - struct mem_cgroup *mem = NULL; - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) __mem_cgroup_commit_charge_swapin(page, mem, @@ -2501,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct mem_cgroup *mem; int ret; + *ptr = NULL; + if (mem_cgroup_disabled()) return 0; @@ -2518,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); + return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); } static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; if (!ptr) return; cgroup_exclude_rmdir(&ptr->css); - pc = lookup_page_cgroup(page); - mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); - mem_cgroup_lru_add_after_commit_swapcache(page); + + __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -2589,15 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem, PAGE_SIZE); + __mem_cgroup_cancel_charge(mem, 1); } -static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, - int page_size) +static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, + unsigned int nr_pages, + const enum charge_type ctype) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; + /* If swapout, usage of swap doesn't decrease */ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) uncharge_memsw = false; @@ -2621,7 +2657,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; - if (page_size != PAGE_SIZE) + if (nr_pages > 1) goto direct_uncharge; /* @@ -2632,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, if (batch->memcg != mem) goto direct_uncharge; /* remember freed charge and uncharge it later */ - batch->bytes += PAGE_SIZE; + batch->nr_pages++; if (uncharge_memsw) - batch->memsw_bytes += PAGE_SIZE; + batch->memsw_nr_pages++; return; direct_uncharge: - res_counter_uncharge(&mem->res, page_size); + res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, page_size); + res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2651,10 +2687,9 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - int count; - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; - int page_size = PAGE_SIZE; + unsigned int nr_pages = 1; + struct page_cgroup *pc; if (mem_cgroup_disabled()) return NULL; @@ -2663,11 +2698,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) return NULL; if (PageTransHuge(page)) { - page_size <<= compound_order(page); + nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); } - - count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2700,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); ClearPageCgroupUsed(pc); /* @@ -2721,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype, page_size); + mem_cgroup_do_uncharge(mem, nr_pages, ctype); return mem; @@ -2761,8 +2794,8 @@ void mem_cgroup_uncharge_start(void) /* We can do nest. */ if (current->memcg_batch.do_batch == 1) { current->memcg_batch.memcg = NULL; - current->memcg_batch.bytes = 0; - current->memcg_batch.memsw_bytes = 0; + current->memcg_batch.nr_pages = 0; + current->memcg_batch.memsw_nr_pages = 0; } } @@ -2783,10 +2816,12 @@ void mem_cgroup_uncharge_end(void) * This "batch->memcg" is valid without any css_get/put etc... * bacause we hide charges behind us. */ - if (batch->bytes) - res_counter_uncharge(&batch->memcg->res, batch->bytes); - if (batch->memsw_bytes) - res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + if (batch->nr_pages) + res_counter_uncharge(&batch->memcg->res, + batch->nr_pages * PAGE_SIZE); + if (batch->memsw_nr_pages) + res_counter_uncharge(&batch->memcg->memsw, + batch->memsw_nr_pages * PAGE_SIZE); memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; @@ -2911,11 +2946,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) { - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + struct page_cgroup *pc; enum charge_type ctype; int ret = 0; + *ptr = NULL; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2966,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, ptr, false, PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2993,7 +3030,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); return ret; } @@ -3058,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; int ret; if (mem_cgroup_disabled()) @@ -3071,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, return ret; } +#ifdef CONFIG_DEBUG_VM +static struct page_cgroup *lookup_page_cgroup_used(struct page *page) +{ + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); + if (likely(pc) && PageCgroupUsed(pc)) + return pc; + return NULL; +} + +bool mem_cgroup_bad_page_check(struct page *page) +{ + if (mem_cgroup_disabled()) + return false; + + return lookup_page_cgroup_used(page) != NULL; +} + +void mem_cgroup_print_bad_page(struct page *page) +{ + struct page_cgroup *pc; + + pc = lookup_page_cgroup_used(page); + if (pc) { + int ret = -1; + char *path; + + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + pc, pc->flags, pc->mem_cgroup); + + path = kmalloc(PATH_MAX, GFP_KERNEL); + if (path) { + rcu_read_lock(); + ret = cgroup_path(pc->mem_cgroup->css.cgroup, + path, PATH_MAX); + rcu_read_unlock(); + } + + printk(KERN_CONT "(%s)\n", + (ret < 0) ? "cannot get the path" : path); + kfree(path); + } +} +#endif + static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, @@ -3314,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, loop += 256; busy = NULL; while (loop--) { + struct page *page; + ret = 0; spin_lock_irqsave(&zone->lru_lock, flags); if (list_empty(list)) { @@ -3329,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } spin_unlock_irqrestore(&zone->lru_lock, flags); - ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); + page = lookup_cgroup_page(pc); + + ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); if (ret == -ENOMEM) break; @@ -3477,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, } -static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; - s64 val = 0; + long val = 0; - /* each per cpu's value can be minus.Then, use s64 */ + /* Per-cpu values can be negative, use a signed accumulator */ for_each_mem_cgroup_tree(iter, mem) val += mem_cgroup_read_stat(iter, idx); @@ -3503,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) return res_counter_read_u64(&mem->memsw, RES_USAGE); } - val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } @@ -3728,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) s->stat[MCS_RSS] += val * PAGE_SIZE; val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); @@ -3854,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; } - spin_lock(&memcg->reclaim_param_lock); memcg->swappiness = val; - spin_unlock(&memcg->reclaim_param_lock); cgroup_unlock(); @@ -4512,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->memsw, NULL); } mem->last_scanned_child = 0; - spin_lock_init(&mem->reclaim_param_lock); INIT_LIST_HEAD(&mem->oom_notify); if (parent) @@ -4600,8 +4683,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, - PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4947,8 +5029,8 @@ retry: if (isolate_lru_page(page)) goto put; pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(pc, - mc.from, mc.to, false, PAGE_SIZE)) { + if (!mem_cgroup_move_account(page, 1, pc, + mc.from, mc.to, false)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e0af336530c..37feb9fec22 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -945,7 +945,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, collect_procs(ppage, &tokill); if (hpage != ppage) - lock_page_nosync(ppage); + lock_page(ppage); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) @@ -1038,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * Check "just unpoisoned", "filter hit", and * "race with other subpage." */ - lock_page_nosync(hpage); + lock_page(hpage); if (!PageHWPoison(hpage) || (hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { @@ -1088,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - lock_page_nosync(hpage); + lock_page(hpage); /* * unpoison always clear PG_hwpoison inside page lock @@ -1231,7 +1231,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - lock_page_nosync(page); + lock_page(page); /* * This test is racy because PG_hwpoison is set outside of page lock. * That's acceptable because that won't trigger kernel panic. Instead, diff --git a/mm/memory.c b/mm/memory.c index 615be5127ce..51a5c23704a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1486,9 +1486,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { + if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); + struct vm_area_struct *gate_vma = get_gate_vma(mm); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -1591,10 +1591,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? i : -EFAULT; BUG(); } - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } if (ret & VM_FAULT_RETRY) { if (nonblocking) @@ -1641,7 +1644,8 @@ EXPORT_SYMBOL(__get_user_pages); /** * get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -2767,7 +2771,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; @@ -3499,7 +3503,7 @@ static int __init gate_vma_init(void) __initcall(gate_vma_init); #endif -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef AT_SYSINFO_EHDR return &gate_vma; @@ -3508,7 +3512,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) #endif } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) @@ -3649,20 +3653,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, #endif /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space as given in mm. If non-NULL, use the + * given task for page fault accounting. */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { - struct mm_struct *mm; struct vm_area_struct *vma; void *old_buf = buf; - mm = get_task_mm(tsk); - if (!mm) - return 0; - down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ while (len) { @@ -3711,11 +3710,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in addr += bytes; } up_read(&mm->mmap_sem); - mmput(mm); return buf - old_buf; } +/** + * @access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, int write) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); + + return ret; +} + /* * Print the name of a VMA. */ diff --git a/mm/migrate.c b/mm/migrate.c index 89e5c3fe8bb..b0406d739ea 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -633,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; int charge = 0; - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; if (!newpage) diff --git a/mm/mlock.c b/mm/mlock.c index c3924c7f00b..2689a08c79a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -237,7 +237,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current))) { + vma == get_gate_vma(current->mm))) { __mlock_vma_pages_range(vma, start, end, NULL); @@ -332,7 +332,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int lock = newflags & VM_LOCKED; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) goto out; /* don't set VM_LOCKED, don't count */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e2bdb07079c..e99f6cd1da1 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -32,14 +32,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { diff --git a/mm/nommu.c b/mm/nommu.c index f59e1424d3d..cb86e7d5e7f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} - unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -1963,7 +1959,7 @@ error: return -ENOMEM; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3100bc57036..6a819d1b2c7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -406,7 +406,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); - __show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(mem, nodemask); } @@ -549,6 +549,17 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned int points = 0; struct task_struct *p; + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. + */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + boost_dying_task_prio(current, NULL); + return; + } + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 632b46479c9..31f69886242 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1040,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc, int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct blk_plug plug; + int ret; + /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; - return write_cache_pages(mapping, wbc, __writepage, mapping); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; } EXPORT_SYMBOL(generic_writepages); @@ -1251,7 +1257,7 @@ int set_page_dirty_lock(struct page *page) { int ret; - lock_page_nosync(page); + lock_page(page); ret = set_page_dirty(page); unlock_page(page); return ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a58221f4c2..d6e7ba7373b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ #include <linux/compaction.h> #include <trace/events/kmem.h> #include <linux/ftrace_event.h> +#include <linux/memcontrol.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { + (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -754,7 +756,8 @@ static inline int check_new_page(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { + (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -2192,7 +2195,7 @@ nopage: current->comm, order, gfp_mask); dump_stack(); if (!should_suppress_show_mem()) - __show_mem(filter); + show_mem(filter); } return page; got_pg: @@ -5684,4 +5687,5 @@ void dump_page(struct page *page) page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); + mem_cgroup_print_bad_page(page); } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 59a3cd4c799..a12cc3fa985 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,12 +11,11 @@ #include <linux/swapops.h> #include <linux/kmemleak.h> -static void __meminit -__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) { pc->flags = 0; + set_page_cgroup_array_id(pc, id); pc->mem_cgroup = NULL; - pc->page = pfn_to_page(pfn); INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; @@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return base + offset; } +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + unsigned long pfn; + struct page *page; + pg_data_t *pgdat; + + pgdat = NODE_DATA(page_cgroup_array_id(pc)); + pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; + page = pfn_to_page(pfn); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + static int __init alloc_node_page_cgroup(int nid) { struct page_cgroup *base, *pc; @@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid) return -ENOMEM; for (index = 0; index < nr_pages; index++) { pc = base + index; - __init_page_cgroup(pc, start_pfn + index); + init_page_cgroup(pc, nid); } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; @@ -105,46 +117,75 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } -/* __alloc_bootmem...() is protected by !slab_available() */ +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + struct mem_section *section; + struct page *page; + unsigned long nr; + + nr = page_cgroup_array_id(pc); + section = __nr_to_section(nr); + page = pfn_to_page(pc - section->page_cgroup); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + +static void *__init_refok alloc_page_cgroup(size_t size, int nid) +{ + void *addr = NULL; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); + if (addr) + return addr; + + if (node_state(nid, N_HIGH_MEMORY)) + addr = vmalloc_node(size, nid); + else + addr = vmalloc(size); + + return addr; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} +#endif + static int __init_refok init_section_page_cgroup(unsigned long pfn) { - struct mem_section *section = __pfn_to_section(pfn); struct page_cgroup *base, *pc; + struct mem_section *section; unsigned long table_size; + unsigned long nr; int nid, index; - if (!section->page_cgroup) { - nid = page_to_nid(pfn_to_page(pfn)); - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - VM_BUG_ON(!slab_is_available()); - if (node_state(nid, N_HIGH_MEMORY)) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); - if (!base) - base = vmalloc(table_size); - } - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); - } else { - /* - * We don't have to allocate page_cgroup again, but - * address of memmap may be changed. So, we have to initialize - * again. - */ - base = section->page_cgroup + pfn; - table_size = 0; - /* check address of memmap is changed or not. */ - if (base->page == pfn_to_page(pfn)) - return 0; - } + nr = pfn_to_section_nr(pfn); + section = __nr_to_section(nr); + + if (section->page_cgroup) + return 0; + + nid = page_to_nid(pfn_to_page(pfn)); + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + base = alloc_page_cgroup(table_size, nid); + + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); if (!base) { printk(KERN_ERR "page cgroup allocation failure\n"); @@ -153,7 +194,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) for (index = 0; index < PAGES_PER_SECTION; index++) { pc = base + index; - __init_page_cgroup(pc, pfn + index); + init_page_cgroup(pc, nr); } section->page_cgroup = base - pfn; @@ -170,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn) if (!ms || !ms->page_cgroup) return; base = ms->page_cgroup + pfn; - if (is_vmalloc_addr(base)) { - vfree(base); - ms->page_cgroup = NULL; - } else { - struct page *page = virt_to_page(base); - if (!PageReserved(page)) { /* Is bootmem ? */ - kfree(base); - ms->page_cgroup = NULL; - } - } + free_page_cgroup(base); + ms->page_cgroup = NULL; } int __meminit online_page_cgroup(unsigned long start_pfn, diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf46..dc76b4d0611 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); diff --git a/mm/readahead.c b/mm/readahead.c index 77506a291a2..2c0cc489e28 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, struct list_head *pages, unsigned nr_pages) { + struct blk_plug plug; unsigned page_idx; int ret; + blk_start_plug(&plug); + if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); /* Clean up the remaining pages */ @@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp, page_cache_release(page); } ret = 0; + out: + blk_finish_plug(&plug); + return ret; } @@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); - -#ifdef CONFIG_BLOCK - /* - * Normally the current page is !uptodate and lock_page() will be - * immediately called to implicitly unplug the device. However this - * is not always true for RAID conifgurations, where data arrives - * not strictly in their submission order. In this case we need to - * explicitly kick off the IO. - */ - if (PageUptodate(page)) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index 4a8e99a0fb9..8da044a1db0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -31,11 +31,12 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * within inode_wb_list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock diff --git a/mm/shmem.c b/mm/shmem.c index 91ce9a1024d..58da7c150ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = default_unplug_io_fn, }; static LIST_HEAD(shmem_swaplist); diff --git a/mm/slub.c b/mm/slub.c index 93de30db95f..f881874843a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -849,11 +849,11 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) local_irq_save(flags); kmemcheck_slab_free(s, x, s->objsize); debug_check_no_locks_freed(x, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->objsize); local_irq_restore(flags); } #endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->objsize); } /* @@ -1604,7 +1604,7 @@ static inline void note_cmpxchg_failure(const char *n, void init_kmem_cache_cpus(struct kmem_cache *s) { -#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT) +#ifdef CONFIG_CMPXCHG_LOCAL int cpu; for_each_possible_cpu(cpu) diff --git a/mm/swap_state.c b/mm/swap_state.c index 5c8cfabbc9b..46680461785 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -24,12 +24,10 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_page_list, to make sync_page look nicer, and to allow - * future use of radix_tree tags in the swap cache. + * vmscan's shrink_page_list. */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, .migratepage = migrate_page, }; @@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = { static struct backing_dev_info swap_backing_dev_info = { .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = swap_unplug_io_fn, }; struct address_space swapper_space = { diff --git a/mm/swapfile.c b/mm/swapfile.c index aafcf3611b3..8c6b3ce38f0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) } /* - * We need this because the bdev->unplug_fn can sleep and we cannot - * hold swap_lock while calling the unplug_fn. And swap_lock - * cannot be turned into a mutex. - */ -static DECLARE_RWSEM(swap_unplug_sem); - -void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) -{ - swp_entry_t entry; - - down_read(&swap_unplug_sem); - entry.val = page_private(page); - if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)]->bdev; - struct backing_dev_info *bdi; - - /* - * If the page is removed from swapcache from under us (with a - * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page_private(page) above. - * If the WARN_ON triggers during a swapoff it maybe the race - * condition and it's harmless. However if it triggers without - * swapoff it signals a problem. - */ - WARN_ON(page_count(page) <= 1); - - bdi = bdev->bd_inode->i_mapping->backing_dev_info; - blk_run_backing_dev(bdi, page); - } - up_read(&swap_unplug_sem); -} - -/* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ @@ -880,7 +847,7 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -1662,10 +1629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } - /* wait for any unplug function to finish */ - down_write(&swap_unplug_sem); - up_write(&swap_unplug_sem); - destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -2088,7 +2051,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; - inode = mapping->host; for (i = 0; i < nr_swapfiles; i++) { struct swap_info_struct *q = swap_info[i]; @@ -2101,6 +2063,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } + inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; @@ -2187,8 +2151,10 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); if (swap_file) { - if (inode && S_ISREG(inode->i_mode)) + if (inode && S_ISREG(inode->i_mode)) { mutex_unlock(&inode->i_mutex); + inode = NULL; + } filp_close(swap_file, NULL); } out: diff --git a/mm/vmscan.c b/mm/vmscan.c index 060e4c19140..f73b8657c2d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -358,7 +358,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page_nosync(page); + lock_page(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); |