diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 23 | ||||
-rw-r--r-- | mm/Kconfig.debug | 25 | ||||
-rw-r--r-- | mm/Makefile | 9 | ||||
-rw-r--r-- | mm/backing-dev.c | 99 | ||||
-rw-r--r-- | mm/bootmem.c | 188 | ||||
-rw-r--r-- | mm/cleancache.c | 244 | ||||
-rw-r--r-- | mm/compaction.c | 135 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/failslab.c | 39 | ||||
-rw-r--r-- | mm/filemap.c | 401 | ||||
-rw-r--r-- | mm/filemap_xip.c | 4 | ||||
-rw-r--r-- | mm/fremap.c | 6 | ||||
-rw-r--r-- | mm/huge_memory.c | 150 | ||||
-rw-r--r-- | mm/hugetlb.c | 89 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 2 | ||||
-rw-r--r-- | mm/init-mm.c | 3 | ||||
-rw-r--r-- | mm/internal.h | 11 | ||||
-rw-r--r-- | mm/kmemleak.c | 15 | ||||
-rw-r--r-- | mm/ksm.c | 38 | ||||
-rw-r--r-- | mm/maccess.c | 8 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memblock.c | 249 | ||||
-rw-r--r-- | mm/memcontrol.c | 1573 | ||||
-rw-r--r-- | mm/memory-failure.c | 182 | ||||
-rw-r--r-- | mm/memory.c | 778 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 101 | ||||
-rw-r--r-- | mm/mempolicy.c | 199 | ||||
-rw-r--r-- | mm/migrate.c | 77 | ||||
-rw-r--r-- | mm/mincore.c | 11 | ||||
-rw-r--r-- | mm/mlock.c | 30 | ||||
-rw-r--r-- | mm/mmap.c | 201 | ||||
-rw-r--r-- | mm/mremap.c | 16 | ||||
-rw-r--r-- | mm/nobootmem.c | 404 | ||||
-rw-r--r-- | mm/nommu.c | 212 | ||||
-rw-r--r-- | mm/oom_kill.c | 137 | ||||
-rw-r--r-- | mm/page-writeback.c | 316 | ||||
-rw-r--r-- | mm/page_alloc.c | 457 | ||||
-rw-r--r-- | mm/page_cgroup.c | 245 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/pagewalk.c | 73 | ||||
-rw-r--r-- | mm/percpu.c | 19 | ||||
-rw-r--r-- | mm/prio_tree.c | 1 | ||||
-rw-r--r-- | mm/readahead.c | 20 | ||||
-rw-r--r-- | mm/rmap.c | 440 | ||||
-rw-r--r-- | mm/shmem.c | 2224 | ||||
-rw-r--r-- | mm/slab.c | 187 | ||||
-rw-r--r-- | mm/slob.c | 14 | ||||
-rw-r--r-- | mm/slub.c | 1262 | ||||
-rw-r--r-- | mm/sparse.c | 4 | ||||
-rw-r--r-- | mm/swap.c | 244 | ||||
-rw-r--r-- | mm/swap_state.c | 5 | ||||
-rw-r--r-- | mm/swapfile.c | 468 | ||||
-rw-r--r-- | mm/thrash.c | 120 | ||||
-rw-r--r-- | mm/truncate.c | 199 | ||||
-rw-r--r-- | mm/util.c | 26 | ||||
-rw-r--r-- | mm/vmalloc.c | 194 | ||||
-rw-r--r-- | mm/vmscan.c | 448 | ||||
-rw-r--r-- | mm/vmstat.c | 273 |
58 files changed, 7997 insertions, 4907 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e9c0c61f2dd..f2f1ca19ed5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -347,3 +347,26 @@ config NEED_PER_CPU_KM depends on !SMP bool default y + +config CLEANCACHE + bool "Enable cleancache driver to cache clean pages if tmem is present" + default n + help + Cleancache can be thought of as a page-granularity victim cache + for clean pages that the kernel's pageframe replacement algorithm + (PFRA) would like to keep around, but can't since there isn't enough + memory. So when the PFRA "evicts" a page, it first attempts to use + cleancache code to put the data contained in that page into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. And when a cleancache-enabled + filesystem wishes to access a page in a file on disk, it first + checks cleancache to see if it already contains it; if it does, + the page is copied into the kernel and a disk access is avoided. + When a transcendent memory driver is available (such as zcache or + Xen transcendent memory), a significant I/O reduction + may be achieved. When none is available, all cleancache calls + are reduced to a single pointer-compare-against-NULL resulting + in a negligible performance hit. + + If unsure, say Y to enable cleancache diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index af7cfb43d2f..8b1a477162d 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,27 +1,24 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on !HIBERNATION || !PPC && !SPARC + depends on DEBUG_KERNEL + depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK + select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types of memory corruption. + For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, + fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). Additionally, + this option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. + config WANT_PAGE_DEBUG_FLAGS bool config PAGE_POISONING - bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on !HIBERNATION - select DEBUG_PAGEALLOC + bool select WANT_PAGE_DEBUG_FLAGS - ---help--- - Fill the pages with poison patterns after free_pages() and verify - the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruption. - - This option cannot be enabled in combination with hibernation as - that would result in incorrect warnings of memory corruption after - a resume because free pages are not saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index 2b1b575ae71..836e4163c1b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ @@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ $(mmu-y) obj-y += init-mm.o +ifdef CONFIG_NO_BOOTMEM + obj-y += nobootmem.o +else + obj-y += bootmem.o +endif + obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -43,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d3022..d6edf8d14f9 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,17 +14,11 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -51,6 +45,17 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) +{ + if (wb1 < wb2) { + spin_lock(&wb1->list_lock); + spin_lock_nested(&wb2->list_lock, 1); + } else { + spin_lock(&wb2->list_lock); + spin_lock_nested(&wb1->list_lock, 1); + } +} + #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -69,38 +74,46 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; + unsigned long nr_dirty, nr_io, nr_more_io; struct inode *inode; - nr_wb = nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_lock); + nr_dirty = nr_io = nr_more_io = 0; + spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_lock); + spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, - "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" - "BdiDirtyThresh: %8lu kB\n" - "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n" - "b_dirty: %8lu\n" - "b_io: %8lu\n" - "b_more_io: %8lu\n" - "bdi_list: %8u\n" - "state: %8lx\n", + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_dirty, nr_io, nr_more_io, + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), + nr_dirty, + nr_io, + nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -255,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void bdi_flush_io(struct backing_dev_info *bdi) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .range_cyclic = 1, - .nr_to_write = 1024, - }; - - writeback_inodes_wb(&bdi->wb, &wbc); -} - /* * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be @@ -452,9 +453,10 @@ static int bdi_forker_thread(void *ptr) if (IS_ERR(task)) { /* * If thread creation fails, force writeout of - * the bdi from the thread. + * the bdi from the thread. Hopefully 1024 is + * large enough for efficient IO. */ - bdi_flush_io(bdi); + writeback_inodes_wb(&bdi->wb, 1024); } else { /* * The spinlock makes sure we do not lose @@ -511,7 +513,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); - synchronize_rcu(); + synchronize_rcu_expedited(); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -604,7 +606,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdi == bdi) - sb->s_bdi = NULL; + sb->s_bdi = &default_backing_dev_info; } spin_unlock(&sb_lock); } @@ -612,6 +614,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); del_timer_sync(&bdi->wb.wakeup_timer); @@ -634,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + spin_lock_init(&wb->list_lock); setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -659,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi) } bdi->dirty_exceeded = 0; + + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + err = prop_local_init_percpu(&bdi->completions); if (err) { @@ -682,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + bdi_lock_two(&bdi->wb, dst); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&bdi->wb.list_lock); + spin_unlock(&dst->list_lock); } bdi_unregister(bdi); @@ -793,7 +810,7 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absense of zone congestion, cond_resched() is called to yield + * In the absence of zone congestion, cond_resched() is called to yield * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. Otherwise, diff --git a/mm/bootmem.c b/mm/bootmem.c index 13b0caa9793..01d5a4b3dd0 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -23,19 +23,17 @@ #include "internal.h" +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data = { + .bdata = &bootmem_node_data[0] +}; +EXPORT_SYMBOL(contig_page_data); +#endif + unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - -#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -146,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -#endif + /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -171,53 +169,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } -#ifdef CONFIG_NO_BOOTMEM -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int i; - unsigned long start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); - - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); - - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - return; - } - - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); - - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); -} - -unsigned long __init free_all_memory_core_early(int nodeid) -{ - int i; - u64 start, end; - unsigned long count = 0; - struct range *range = NULL; - int nr_range; - - nr_range = get_free_all_memory_range(&range, nodeid); - - for (i = 0; i < nr_range; i++) { - start = range[i].start; - end = range[i].end; - count += end - start; - __free_pages_memory(start, end); - } - - return count; -} -#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -278,7 +229,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -289,12 +239,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); -#ifdef CONFIG_NO_BOOTMEM - /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -#else return free_all_bootmem_core(pgdat->bdata); -#endif } /** @@ -304,16 +249,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { -#ifdef CONFIG_NO_BOOTMEM - /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesnt have RAM installed - * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related - */ - return free_all_memory_core_early(MAX_NUMNODES); -#else unsigned long total_pages = 0; bootmem_data_t *bdata; @@ -321,10 +256,8 @@ unsigned long __init free_all_bootmem(void) total_pages += free_all_bootmem_core(bdata); return total_pages; -#endif } -#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -419,7 +352,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } -#endif /** * free_bootmem_node - mark a page range as usable @@ -434,10 +366,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(physaddr), size); - memblock_x86_free_range(physaddr, physaddr + size); -#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -446,7 +374,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -#endif } /** @@ -460,10 +387,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(addr), size); - memblock_x86_free_range(addr, addr + size); -#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -472,7 +395,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); -#endif } /** @@ -489,17 +411,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -#endif } /** @@ -515,20 +432,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); -#endif } -#ifndef CONFIG_NO_BOOTMEM int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { @@ -685,33 +596,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } -#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -#ifdef CONFIG_NO_BOOTMEM - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -#else bootmem_data_t *bdata; void *region; @@ -737,7 +627,6 @@ restart: } return NULL; -#endif } /** @@ -758,10 +647,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem_nopanic(size, align, goal, limit); } @@ -798,14 +683,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem(size, align, goal, limit); } -#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -822,7 +702,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } -#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -842,24 +721,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); -#endif - - return ptr; + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -880,13 +745,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - new_goal, -1ULL); -#else ptr = alloc_bootmem_core(pgdat->bdata, size, align, new_goal, 0); -#endif if (ptr) return ptr; } @@ -907,16 +767,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { -#ifdef CONFIG_NO_BOOTMEM - unsigned long pfn, goal, limit; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; - - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); -#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -926,7 +776,6 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); -#endif } #endif @@ -938,16 +787,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); -#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); -#endif if (ptr) return ptr; @@ -995,21 +839,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#endif - return ptr; } diff --git a/mm/cleancache.c b/mm/cleancache.c new file mode 100644 index 00000000000..bcaae4c2a77 --- /dev/null +++ b/mm/cleancache.c @@ -0,0 +1,244 @@ +/* + * Cleancache frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of cleancache. See + * Documentation/vm/cleancache.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/exportfs.h> +#include <linux/mm.h> +#include <linux/cleancache.h> + +/* + * This global enablement flag may be read thousands of times per second + * by cleancache_get/put/flush even on systems where cleancache_ops + * is not claimed (e.g. cleancache is config'ed on but remains + * disabled), so is preferred to the slower alternative: a function + * call that checks a non-global. + */ +int cleancache_enabled; +EXPORT_SYMBOL(cleancache_enabled); + +/* + * cleancache_ops is set by cleancache_ops_register to contain the pointers + * to the cleancache "backend" implementation functions. + */ +static struct cleancache_ops cleancache_ops; + +/* useful stats available in /sys/kernel/mm/cleancache */ +static unsigned long cleancache_succ_gets; +static unsigned long cleancache_failed_gets; +static unsigned long cleancache_puts; +static unsigned long cleancache_flushes; + +/* + * register operations for cleancache, returning previous thus allowing + * detection of multiple backends and possible nesting + */ +struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +{ + struct cleancache_ops old = cleancache_ops; + + cleancache_ops = *ops; + cleancache_enabled = 1; + return old; +} +EXPORT_SYMBOL(cleancache_register_ops); + +/* Called by a cleancache-enabled filesystem at time of mount */ +void __cleancache_init_fs(struct super_block *sb) +{ + sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); +} +EXPORT_SYMBOL(__cleancache_init_fs); + +/* Called by a cleancache-enabled clustered filesystem at time of mount */ +void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +{ + sb->cleancache_poolid = + (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); +} +EXPORT_SYMBOL(__cleancache_init_shared_fs); + +/* + * If the filesystem uses exportable filehandles, use the filehandle as + * the key, else use the inode number. + */ +static int cleancache_get_key(struct inode *inode, + struct cleancache_filekey *key) +{ + int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int len = 0, maxlen = CLEANCACHE_KEY_MAX; + struct super_block *sb = inode->i_sb; + + key->u.ino = inode->i_ino; + if (sb->s_export_op != NULL) { + fhfn = sb->s_export_op->encode_fh; + if (fhfn) { + struct dentry d; + d.d_inode = inode; + len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); + if (len <= 0 || len == 255) + return -1; + if (maxlen > CLEANCACHE_KEY_MAX) + return -1; + } + } + return 0; +} + +/* + * "Get" data from cleancache associated with the poolid/inode/index + * that were specified when the data was put to cleanache and, if + * successful, use it to fill the specified page with data and return 0. + * The pageframe is unchanged and returns -1 if the get fails. + * Page must be locked by caller. + */ +int __cleancache_get_page(struct page *page) +{ + int ret = -1; + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) + goto out; + + if (cleancache_get_key(page->mapping->host, &key) < 0) + goto out; + + ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + if (ret == 0) + cleancache_succ_gets++; + else + cleancache_failed_gets++; +out: + return ret; +} +EXPORT_SYMBOL(__cleancache_get_page); + +/* + * "Put" data from a page to cleancache and associate it with the + * (previously-obtained per-filesystem) poolid and the page's, + * inode and page index. Page must be locked. Note that a put_page + * always "succeeds", though a subsequent get_page may succeed or fail. + */ +void __cleancache_put_page(struct page *page) +{ + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id >= 0 && + cleancache_get_key(page->mapping->host, &key) >= 0) { + (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_puts++; + } +} +EXPORT_SYMBOL(__cleancache_put_page); + +/* + * Flush any data from cleancache associated with the poolid and the + * page's inode and page index so that a subsequent "get" will fail. + */ +void __cleancache_flush_page(struct address_space *mapping, struct page *page) +{ + /* careful... page->mapping is NULL sometimes when this is called */ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0) { + VM_BUG_ON(!PageLocked(page)); + if (cleancache_get_key(mapping->host, &key) >= 0) { + (*cleancache_ops.flush_page)(pool_id, key, page->index); + cleancache_flushes++; + } + } +} +EXPORT_SYMBOL(__cleancache_flush_page); + +/* + * Flush all data from cleancache associated with the poolid and the + * mappings's inode so that all subsequent gets to this poolid/inode + * will fail. + */ +void __cleancache_flush_inode(struct address_space *mapping) +{ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) + (*cleancache_ops.flush_inode)(pool_id, key); +} +EXPORT_SYMBOL(__cleancache_flush_inode); + +/* + * Called by any cleancache-enabled filesystem at time of unmount; + * note that pool_id is surrendered and may be reutrned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs + */ +void __cleancache_flush_fs(struct super_block *sb) +{ + if (sb->cleancache_poolid >= 0) { + int old_poolid = sb->cleancache_poolid; + sb->cleancache_poolid = -1; + (*cleancache_ops.flush_fs)(old_poolid); + } +} +EXPORT_SYMBOL(__cleancache_flush_fs); + +#ifdef CONFIG_SYSFS + +/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ + +#define CLEANCACHE_SYSFS_RO(_name) \ + static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%lu\n", cleancache_##_name); \ + } \ + static struct kobj_attribute cleancache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = cleancache_##_name##_show, \ + } + +CLEANCACHE_SYSFS_RO(succ_gets); +CLEANCACHE_SYSFS_RO(failed_gets); +CLEANCACHE_SYSFS_RO(puts); +CLEANCACHE_SYSFS_RO(flushes); + +static struct attribute *cleancache_attrs[] = { + &cleancache_succ_gets_attr.attr, + &cleancache_failed_gets_attr.attr, + &cleancache_puts_attr.attr, + &cleancache_flushes_attr.attr, + NULL, +}; + +static struct attribute_group cleancache_attr_group = { + .attrs = cleancache_attrs, + .name = "cleancache", +}; + +#endif /* CONFIG_SYSFS */ + +static int __init init_cleancache(void) +{ +#ifdef CONFIG_SYSFS + int err; + + err = sysfs_create_group(mm_kobj, &cleancache_attr_group); +#endif /* CONFIG_SYSFS */ + return 0; +} +module_init(init_cleancache) diff --git a/mm/compaction.c b/mm/compaction.c index 8be430b812d..6cc604bd564 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,8 +42,6 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -146,16 +144,26 @@ static void isolate_freepages(struct zone *zone, int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; + /* + * Initialise the free scanner. The starting point is where we last + * scanned from (or the end of the zone if starting). The low point + * is the end of the pageblock the migration scanner is using. + */ pfn = cc->free_pfn; low_pfn = cc->migrate_pfn + pageblock_nr_pages; - high_pfn = low_pfn; + + /* + * Take care that if the migration scanner is at the end of the zone + * that the free scanner does not accidentally move to the next zone + * in the next isolation cycle. + */ + high_pfn = min(low_pfn, pfn); /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - spin_lock_irqsave(&zone->lock, flags); for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; @@ -178,9 +186,19 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* Found a block suitable for isolating free pages from */ - isolated = isolate_freepages_block(zone, pfn, freelist); - nr_freepages += isolated; + /* + * Found a block suitable for isolating free pages from. Now + * we disabled interrupts, double check things are ok and + * isolate the pages. This is to minimise the time IRQs + * are disabled + */ + isolated = 0; + spin_lock_irqsave(&zone->lock, flags); + if (suitable_migration_target(page)) { + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + } + spin_unlock_irqrestore(&zone->lock, flags); /* * Record the highest PFN we isolated pages from. When next @@ -190,7 +208,6 @@ static void isolate_freepages(struct zone *zone, if (isolated) high_pfn = max(high_pfn, pfn); } - spin_unlock_irqrestore(&zone->lock, flags); /* split_free_page does not map the pages */ list_for_each_entry(page, freelist, lru) { @@ -234,11 +251,18 @@ static bool too_many_isolated(struct zone *zone) return isolated > (inactive + active) / 2; } +/* possible outcome of isolate_migratepages */ +typedef enum { + ISOLATE_ABORT, /* Abort compaction now */ + ISOLATE_NONE, /* No pages isolated, continue scanning */ + ISOLATE_SUCCESS, /* Pages isolated, migrate */ +} isolate_migrate_t; + /* * Isolate all pages that can be migrated from the block pointed to by * the migrate scanner within compact_control. */ -static unsigned long isolate_migratepages(struct zone *zone, +static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; @@ -255,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone, /* Do not cross the free scanner or scan within a memory hole */ if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { cc->migrate_pfn = end_pfn; - return 0; + return ISOLATE_NONE; } /* @@ -264,16 +288,38 @@ static unsigned long isolate_migratepages(struct zone *zone, * delay for some time until fewer pages are isolated */ while (unlikely(too_many_isolated(zone))) { + /* async migration should just abort */ + if (!cc->sync) + return ISOLATE_ABORT; + congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return 0; + return ISOLATE_ABORT; } /* Time to isolate some pages for migration */ + cond_resched(); spin_lock_irq(&zone->lru_lock); for (; low_pfn < end_pfn; low_pfn++) { struct page *page; + bool locked = true; + + /* give a chance to irqs before checking need_resched() */ + if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { + spin_unlock_irq(&zone->lru_lock); + locked = false; + } + if (need_resched() || spin_is_contended(&zone->lru_lock)) { + if (locked) + spin_unlock_irq(&zone->lru_lock); + cond_resched(); + spin_lock_irq(&zone->lru_lock); + if (fatal_signal_pending(current)) + break; + } else if (!locked) + spin_lock_irq(&zone->lru_lock); + if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; @@ -334,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone, trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - return cc->nr_migratepages; + return ISOLATE_SUCCESS; } /* @@ -396,16 +442,6 @@ static int compact_finished(struct zone *zone, if (cc->free_pfn <= cc->migrate_pfn) return COMPACT_COMPLETE; - /* Compaction run is not finished if the watermark is not met */ - if (cc->compact_mode != COMPACT_MODE_KSWAPD) - watermark = low_wmark_pages(zone); - else - watermark = high_wmark_pages(zone); - watermark += (1 << cc->order); - - if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) - return COMPACT_CONTINUE; - /* * order == -1 is expected when compacting via * /proc/sys/vm/compact_memory @@ -413,13 +449,11 @@ static int compact_finished(struct zone *zone, if (cc->order == -1) return COMPACT_CONTINUE; - /* - * Generating only one page of the right order is not enough - * for kswapd, we must continue until we're above the high - * watermark as a pool for high order GFP_ATOMIC allocations - * too. - */ - if (cc->compact_mode == COMPACT_MODE_KSWAPD) + /* Compaction run is not finished if the watermark is not met */ + watermark = low_wmark_pages(zone); + watermark += (1 << cc->order); + + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ @@ -449,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order) unsigned long watermark; /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ + if (order == -1) + return COMPACT_CONTINUE; + + /* * Watermarks for order-0 must be met for compaction. Note the 2UL. * This is because during migration, copies of pages need to be * allocated and for a short time, the footprint is higher @@ -458,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_SKIPPED; /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (order == -1) - return COMPACT_CONTINUE; - - /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * - * index of -1 implies allocations might succeed dependingon watermarks + * index of -1000 implies allocations might succeed depending on + * watermarks * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * @@ -478,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) return COMPACT_SKIPPED; - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, + 0, 0)) return COMPACT_PARTIAL; return COMPACT_CONTINUE; @@ -508,12 +544,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { unsigned long nr_migrate, nr_remaining; + int err; - if (!isolate_migratepages(zone, cc)) + switch (isolate_migratepages(zone, cc)) { + case ISOLATE_ABORT: + ret = COMPACT_PARTIAL; + goto out; + case ISOLATE_NONE: continue; + case ISOLATE_SUCCESS: + ; + } nr_migrate = cc->nr_migratepages; - migrate_pages(&cc->migratepages, compaction_alloc, + err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, cc->sync); update_nr_listpages(cc); @@ -527,13 +571,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_remaining); /* Release LRU pages not migrated */ - if (!list_empty(&cc->migratepages)) { + if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; } } +out: /* Release free pages and check accounting */ cc->nr_freepages -= release_freepages(&cc->freepages); VM_BUG_ON(cc->nr_freepages != 0); @@ -543,8 +588,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, - int compact_mode) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -553,7 +597,6 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -599,8 +642,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, - COMPACT_MODE_DIRECT_RECLAIM); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -631,7 +673,6 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, - .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/dmapool.c b/mm/dmapool.c index 03bf3bb4519..fbb58e34688 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - dma_pool_destroy(pool); WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); + dma_pool_destroy(pool); } EXPORT_SYMBOL(dmam_pool_destroy); diff --git a/mm/failslab.c b/mm/failslab.c index c5f88f240dd..0dd7b8fec71 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -5,10 +5,6 @@ static struct { struct fault_attr attr; u32 ignore_gfp_wait; int cache_filter; -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - struct dentry *ignore_gfp_wait_file; - struct dentry *cache_filter_file; -#endif } failslab = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, @@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init failslab_debugfs_init(void) { - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; - int err; - - err = init_fault_attr_dentries(&failslab.attr, "failslab"); - if (err) - return err; - dir = failslab.attr.dentries.dir; + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - failslab.ignore_gfp_wait_file = - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait); + dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); - failslab.cache_filter_file = - debugfs_create_bool("cache-filter", mode, dir, - &failslab.cache_filter); + if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_wait)) + goto fail; + if (!debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter)) + goto fail; - if (!failslab.ignore_gfp_wait_file || - !failslab.cache_filter_file) { - err = -ENOMEM; - debugfs_remove(failslab.cache_filter_file); - debugfs_remove(failslab.ignore_gfp_wait_file); - cleanup_fault_attr_dentries(&failslab.attr); - } + return 0; +fail: + debugfs_remove_recursive(dir); - return err; + return -ENOMEM; } late_initcall(failslab_debugfs_init); diff --git a/mm/filemap.c b/mm/filemap.c index 83a45d35468..645a080ba4d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,7 +33,7 @@ #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> -#include <linux/mm_inline.h> /* for page_is_file_cache() */ +#include <linux/cleancache.h> #include "internal.h" /* @@ -58,16 +58,16 @@ /* * Lock ordering: * - * ->i_mmap_lock (truncate_pagecache) + * ->i_mmap_mutex (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_lock (truncate->unmap_mapping_range) + * ->i_mmap_mutex (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_lock + * ->i_mmap_mutex * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * @@ -77,14 +77,11 @@ * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * ->i_mutex - * ->i_alloc_sem (various) - * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * bdi->wb.list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_lock + * ->i_mmap_mutex * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock @@ -98,26 +95,39 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) + * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) + * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * * (code doesn't rely on that order, so you could switch it around) * ->tasklist_lock (memory_failure, collect_procs_ao) - * ->i_mmap_lock + * ->i_mmap_mutex */ /* - * Remove a page from the page cache and free it. Caller has to make + * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __remove_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page) && PageMappedToDisk(page)) + cleancache_put_page(page); + else + cleancache_flush_page(mapping, page); + radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) @@ -137,7 +147,15 @@ void __remove_from_page_cache(struct page *page) } } -void remove_from_page_cache(struct page *page) +/** + * delete_from_page_cache - delete page from page cache + * @page: the page which the kernel is trying to remove from page cache + * + * This must be called only on pages that have been verified to be in the page + * cache and locked. It will never put the page into the free list, the caller + * has a reference on the page. + */ +void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; void (*freepage)(struct page *); @@ -146,54 +164,25 @@ void remove_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); if (freepage) freepage(page); + page_cache_release(page); } -EXPORT_SYMBOL(remove_from_page_cache); +EXPORT_SYMBOL(delete_from_page_cache); -static int sync_page(void *word) +static int sleep_on_page(void *word) { - struct address_space *mapping; - struct page *page; - - page = container_of((unsigned long *)word, struct page, flags); - - /* - * page_mapping() is being called without PG_locked held. - * Some knowledge of the state and use of the page is used to - * reduce the requirements down to a memory barrier. - * The danger here is of a stale page_mapping() return value - * indicating a struct address_space different from the one it's - * associated with when it is associated with one. - * After smp_mb(), it's either the correct page_mapping() for - * the page, or an old page_mapping() and the page's own - * page_mapping() has gone NULL. - * The ->sync_page() address_space operation must tolerate - * page_mapping() going NULL. By an amazing coincidence, - * this comes about because none of the users of the page - * in the ->sync_page() methods make essential use of the - * page_mapping(), merely passing the page down to the backing - * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page_private(page) is - * of interest. When page_mapping() does go NULL, the entire - * call stack gracefully ignores the page and returns. - * -- wli - */ - smp_mb(); - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->sync_page) - mapping->a_ops->sync_page(page); io_schedule(); return 0; } -static int sync_page_killable(void *word) +static int sleep_on_page_killable(void *word) { - sync_page(word); + sleep_on_page(word); return fatal_signal_pending(current) ? -EINTR : 0; } @@ -387,6 +376,76 @@ int filemap_write_and_wait_range(struct address_space *mapping, EXPORT_SYMBOL(filemap_write_and_wait_range); /** + * replace_page_cache_page - replace a pagecache page with a new one + * @old: page to be replaced + * @new: page to replace with + * @gfp_mask: allocation mode + * + * This function replaces a page in the pagecache with a new one. On + * success it acquires the pagecache reference for the new page and + * drops it for the old page. Both the old and new pages must be + * locked. This function does not add the new page to the LRU, the + * caller must do that. + * + * The remove + add is atomic. The only way this function can fail is + * memory allocation failure. + */ +int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +{ + int error; + struct mem_cgroup *memcg = NULL; + + VM_BUG_ON(!PageLocked(old)); + VM_BUG_ON(!PageLocked(new)); + VM_BUG_ON(new->mapping); + + /* + * This is not page migration, but prepare_migration and + * end_migration does enough work for charge replacement. + * + * In the longer term we probably want a specialized function + * for moving the charge from old to new in a more efficient + * manner. + */ + error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); + if (error) + return error; + + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + if (!error) { + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *); + + pgoff_t offset = old->index; + freepage = mapping->a_ops->freepage; + + page_cache_get(new); + new->mapping = mapping; + new->index = offset; + + spin_lock_irq(&mapping->tree_lock); + __delete_from_page_cache(old); + error = radix_tree_insert(&mapping->page_tree, offset, new); + BUG_ON(error); + mapping->nrpages++; + __inc_zone_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(new)) + __inc_zone_page_state(new, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + radix_tree_preload_end(); + if (freepage) + freepage(old); + page_cache_release(old); + mem_cgroup_end_migration(memcg, old, new, true); + } else { + mem_cgroup_end_migration(memcg, old, new, false); + } + + return error; +} +EXPORT_SYMBOL_GPL(replace_page_cache_page); + +/** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space @@ -402,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int error; VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageSwapBacked(page)); error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); @@ -419,11 +479,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - if (PageSwapBacked(page)) - __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); @@ -441,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, { int ret; - /* - * Splice_read and readahead add shmem/tmpfs pages into the page cache - * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the anon lru below, and mem_cgroup_cache_charge - * (called in add_to_page_cache) needs to know where they're going too. - */ - if (mapping_cap_swap_backed(mapping)) - SetPageSwapBacked(page); - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) { - if (page_is_file_cache(page)) - lru_cache_add_file(page); - else - lru_cache_add_anon(page); - } + if (ret == 0) + lru_cache_add_file(page); return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -479,12 +525,6 @@ struct page *__page_cache_alloc(gfp_t gfp) EXPORT_SYMBOL(__page_cache_alloc); #endif -static int __sleep_on_page_lock(void *word) -{ - io_schedule(); - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -512,11 +552,22 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sync_page, + __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); +int wait_on_page_bit_killable(struct page *page, int bit_nr) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + if (!test_bit(bit_nr, &page->flags)) + return 0; + + return __wait_on_bit(page_waitqueue(page), &wait, + sleep_on_page_killable, TASK_KILLABLE); +} + /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue * @page: Page defining the wait queue of interest @@ -576,17 +627,12 @@ EXPORT_SYMBOL(end_page_writeback); /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock - * - * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some - * random driver's requestfn sets TASK_RUNNING, we could busywait. However - * chances are that on the second loop, the block layer's plug list is empty, - * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -596,34 +642,39 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sync_page_killable, TASK_KILLABLE); + sleep_on_page_killable, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); -/** - * __lock_page_nosync - get a lock on the page, without calling sync_page() - * @page: the page to lock - * - * Variant of lock_page that does not require the caller to hold a reference - * on the page's mapping. - */ -void __lock_page_nosync(struct page *page) -{ - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, - TASK_UNINTERRUPTIBLE); -} - int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { - if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { - __lock_page(page); - return 1; - } else { + if (flags & FAULT_FLAG_ALLOW_RETRY) { + /* + * CAUTION! In this case, mmap_sem is not released + * even though return 0. + */ + if (flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; + up_read(&mm->mmap_sem); - wait_on_page_locked(page); + if (flags & FAULT_FLAG_KILLABLE) + wait_on_page_locked_killable(page); + else + wait_on_page_locked(page); return 0; + } else { + if (flags & FAULT_FLAG_KILLABLE) { + int ret; + + ret = __lock_page_killable(page); + if (ret) { + up_read(&mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; } } @@ -648,9 +699,16 @@ repeat: page = radix_tree_deref_slot(pagep); if (unlikely(!page)) goto out; - if (radix_tree_deref_retry(page)) - goto repeat; - + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto repeat; + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + goto out; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -687,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_page(mapping, offset); - if (page) { + if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { @@ -774,7 +832,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, start, nr_pages); + (void ***)pages, NULL, start, nr_pages); ret = 0; for (i = 0; i < nr_found; i++) { struct page *page; @@ -782,10 +840,23 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - if (radix_tree_deref_retry(page)) { - if (ret) - start = pages[ret-1]->index; - goto restart; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + WARN_ON(start | i); + goto restart; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so skip over it - + * we only reach this from invalidate_mapping_pages(). + */ + continue; } if (!page_cache_get_speculative(page)) @@ -800,6 +871,13 @@ repeat: pages[ret] = page; ret++; } + + /* + * If all entries were removed before we could secure them, + * try again, because callers stop trying once 0 is returned. + */ + if (unlikely(!ret && nr_found)) + goto restart; rcu_read_unlock(); return ret; } @@ -826,7 +904,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, index, nr_pages); + (void ***)pages, NULL, index, nr_pages); ret = 0; for (i = 0; i < nr_found; i++) { struct page *page; @@ -834,8 +912,23 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - if (radix_tree_deref_retry(page)) - goto restart; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so stop looking for + * contiguous pages. + */ + break; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -894,8 +987,22 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - if (radix_tree_deref_retry(page)) - goto restart; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * This function is never used on a shmem/tmpfs + * mapping, so a swap entry won't be found here. + */ + BUG(); + } if (!page_cache_get_speculative(page)) goto repeat; @@ -909,6 +1016,13 @@ repeat: pages[ret] = page; ret++; } + + /* + * If all entries were removed before we could secure them, + * try again, because callers stop trying once 0 is returned. + */ + if (unlikely(!ret && nr_found)) + goto restart; rcu_read_unlock(); if (ret) @@ -1298,12 +1412,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; + struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; + blk_start_plug(&plug); + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1376,6 +1493,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: + blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); @@ -1468,15 +1586,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, /* If we don't want any read-ahead, don't bother */ if (VM_RandomReadHint(vma)) return; + if (!ra->ra_pages) + return; - if (VM_SequentialReadHint(vma) || - offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { + if (VM_SequentialReadHint(vma)) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; } - if (ra->mmap_miss < INT_MAX) + /* Avoid banging the cache line if not needed */ + if (ra->mmap_miss < MMAP_LOTSAMISS * 10) ra->mmap_miss++; /* @@ -1490,12 +1610,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, * mmap read-around */ ra_pages = max_sane_readahead(ra->ra_pages); - if (ra_pages) { - ra->start = max_t(long, 0, offset - ra_pages/2); - ra->size = ra_pages; - ra->async_size = 0; - ra_submit(ra, mapping, file); - } + ra->start = max_t(long, 0, offset - ra_pages / 2); + ra->size = ra_pages; + ra->async_size = ra_pages / 4; + ra_submit(ra, mapping, file); } /* @@ -1562,6 +1680,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -1600,7 +1719,6 @@ retry_find: return VM_FAULT_SIGBUS; } - ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1696,7 +1814,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) { @@ -1727,7 +1845,7 @@ repeat: static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) @@ -1767,7 +1885,7 @@ out: * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. @@ -1779,7 +1897,7 @@ out: */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -1827,7 +1945,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Read into the page cache. If a page already exists, and PageUptodate() is * not set, try to fill the page then wait for it to become unlocked. @@ -1836,7 +1954,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); @@ -1883,16 +2001,26 @@ static int __remove_suid(struct dentry *dentry, int kill) int file_remove_suid(struct file *file) { struct dentry *dentry = file->f_path.dentry; - int killsuid = should_remove_suid(dentry); - int killpriv = security_inode_need_killpriv(dentry); + struct inode *inode = dentry->d_inode; + int killsuid; + int killpriv; int error = 0; + /* Fast path for nothing security related */ + if (IS_NOSEC(inode)) + return 0; + + killsuid = should_remove_suid(dentry); + killpriv = security_inode_need_killpriv(dentry); + if (killpriv < 0) return killpriv; if (killpriv) error = security_inode_killpriv(dentry); if (!error && killsuid) error = __remove_suid(dentry, killsuid); + if (!error && (inode->i_sb->s_flags & MS_NOSEC)) + inode->i_flags |= S_NOSEC; return error; } @@ -2228,7 +2356,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, repeat: page = find_lock_page(mapping, index); if (page) - return page; + goto found; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); if (!page) @@ -2241,6 +2369,8 @@ repeat: goto repeat; return NULL; } +found: + wait_on_page_writeback(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); @@ -2487,11 +2617,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); + blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2502,6 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(generic_file_aio_write); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 83364df74a3..93356cd1282 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping, return; retry: - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + @@ -201,7 +201,7 @@ retry: page_cache_release(page); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (locked) { mutex_unlock(&xip_sparse_mutex); diff --git a/mm/fremap.c b/mm/fremap.c index ec520c7b28d..b8e0e2d468a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } goto out; } - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_prio_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } if (vma->vm_flags & VM_LOCKED) { /* * drop PG_Mlocked flag for over-mapped range */ - unsigned int saved_flags = vma->vm_flags; + vm_flags_t saved_flags = vma->vm_flags; munlock_vma_pages_range(vma, start, start + size); vma->vm_flags = saved_flags; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3e29781ee76..e2d1587be26 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { - if (test_bit(flag, &transparent_hugepage_flags)) - return sprintf(buf, "[yes] no\n"); - else - return sprintf(buf, "yes [no]\n"); + return sprintf(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); } + static ssize_t single_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) { - if (!memcmp("yes", buf, - min(sizeof("yes")-1, count))) { + unsigned long value; + int ret; + + ret = kstrtoul(buf, 10, &value); + if (ret < 0) + return ret; + if (value > 1) + return -EINVAL; + + if (value) set_bit(flag, &transparent_hugepage_flags); - } else if (!memcmp("no", buf, - min(sizeof("no")-1, count))) { + else clear_bit(flag, &transparent_hugepage_flags); - } else - return -EINVAL; return count; } @@ -643,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return ret; } -static inline gfp_t alloc_hugepage_gfpmask(int defrag) +static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr) + unsigned long haddr, int nd, + gfp_t extra_gfp) { - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr); + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), + HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(alloc_hugepage_gfpmask(defrag), + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); } #endif @@ -678,9 +683,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); - if (unlikely(!page)) + vma, haddr, numa_node_id(), 0); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); goto out; + } + count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { put_page(page); goto out; @@ -799,8 +807,9 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | + __GFP_OTHER_NODE, + vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, GFP_KERNEL))) { @@ -902,16 +911,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id(), 0); else new_page = NULL; if (unlikely(!new_page)) { + count_vm_event(THP_FAULT_FALLBACK); ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); put_page(page); goto out; } + count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); @@ -1128,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->lock to + * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ pmdp_splitting_flush_notify(vma, address, pmd); @@ -1322,7 +1333,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->lock hold */ +/* must be called with anon_vma->root->mutex hold */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { @@ -1388,6 +1399,7 @@ int split_huge_page(struct page *page) BUG_ON(!PageSwapBacked(page)); __split_huge_page(page, anon_vma); + count_vm_event(THP_SPLIT); BUG_ON(PageCompound(page)); out_unlock: @@ -1396,6 +1408,9 @@ out: return ret; } +#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ + VM_HUGETLB|VM_SHARED|VM_MAYSHARE) + int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { @@ -1404,11 +1419,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_HUGEPAGE | - VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) + if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; @@ -1424,11 +1435,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_NOHUGEPAGE | - VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) + if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; @@ -1562,10 +1569,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) * page fault if needed. */ return 0; - if (vma->vm_file || vma->vm_ops) + if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + /* + * If is_pfn_mapping() is true is_learn_pfn_mapping() must be + * true too, verify it here. + */ + VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1585,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm) list_del(&mm_slot->mm_node); free = 1; } + spin_unlock(&khugepaged_mm_lock); if (free) { - spin_unlock(&khugepaged_mm_lock); clear_bit(MMF_VM_HUGEPAGE, &mm->flags); free_mm_slot(mm_slot); mmdrop(mm); } else if (mm_slot) { - spin_unlock(&khugepaged_mm_lock); /* * This is required to serialize against * khugepaged_test_exit() (which is guaranteed to run @@ -1603,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm) */ down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); - } else - spin_unlock(&khugepaged_mm_lock); + } } static void release_pte_page(struct page *page) @@ -1745,7 +1754,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1759,6 +1769,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); #ifndef CONFIG_NUMA + up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); new_page = *hpage; #else @@ -1773,22 +1784,29 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node, __GFP_OTHER_NODE); + + /* + * After allocating the hugepage, release the mmap_sem read lock in + * preparation for taking it in write mode. + */ + up_read(&mm->mmap_sem); if (unlikely(!new_page)) { - up_read(&mm->mmap_sem); + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); return; } #endif + + count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { - up_read(&mm->mmap_sem); +#ifdef CONFIG_NUMA put_page(new_page); +#endif return; } - /* after allocating the hugepage upgrade to mmap_sem write mode */ - up_read(&mm->mmap_sem); - /* * Prevent all access to pagetables with the exception of * gup_fast later hanlded by the ptep_clear_flush and the VM @@ -1808,12 +1826,15 @@ static void collapse_huge_page(struct mm_struct *mm, (vma->vm_flags & VM_NOHUGEPAGE)) goto out; - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + if (!vma->anon_vma || vma->vm_ops) goto out; if (is_vma_temporary_stack(vma)) goto out; - VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + /* + * If is_pfn_mapping() is true is_learn_pfn_mapping() must be + * true too, verify it here. + */ + VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -1919,6 +1940,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; + int node = -1; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1949,6 +1971,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -1965,7 +1994,7 @@ out_unmap: pte_unmap_unlock(pte, ptl); if (ret) /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma); + collapse_huge_page(mm, address, hpage, vma, node); out: return ret; } @@ -2038,13 +2067,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; continue; } - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + if (!vma->anon_vma || vma->vm_ops) goto skip; if (is_vma_temporary_stack(vma)) goto skip; - - VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + /* + * If is_pfn_mapping() is true is_learn_pfn_mapping() + * must be true too, verify it here. + */ + VM_BUG_ON(is_linear_pfn_mapping(vma) || + vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; @@ -2135,8 +2167,11 @@ static void khugepaged_do_scan(struct page **hpage) #ifndef CONFIG_NUMA if (!*hpage) { *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); break; + } + count_vm_event(THP_COLLAPSE_ALLOC); } #else if (IS_ERR(*hpage)) @@ -2176,8 +2211,11 @@ static struct page *khugepaged_alloc_hugepage(void) do { hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); } while (unlikely(!hpage) && likely(khugepaged_enabled())); return hpage; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb0b7c12801..dae27ba3be2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,7 +24,7 @@ #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/io.h> +#include <linux/io.h> #include <linux/hugetlb.h> #include <linux/node.h> @@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock); * must either hold the mmap_sem for write, or the mmap_sem for read and * the hugetlb_instantiation mutex: * - * down_write(&mm->mmap_sem); + * down_write(&mm->mmap_sem); * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); */ struct file_region { struct list_head link; @@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t) if (rg->from > t) return chg; - /* We overlap with this area, if it extends futher than + /* We overlap with this area, if it extends further than * us then we must extend ourselves. Account for its * existing reservation. */ if (rg->to > t) { @@ -475,7 +475,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - goto err;; + goto err; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page) h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1 << PG_writeback); } set_compound_page_dtor(page, NULL); set_page_refcounted(page); @@ -591,7 +592,6 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } - EXPORT_SYMBOL_GPL(PageHuge); static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) @@ -842,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) } /* - * Increase the hugetlb pool such that it can accomodate a reservation + * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, int delta) @@ -890,7 +890,7 @@ retry: /* * The surplus_list now contains _at_least_ the number of extra pages - * needed to accomodate the reservation. Add the appropriate number + * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but @@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, */ chg = vma_needs_reservation(h, vma, addr); if (chg < 0) - return ERR_PTR(chg); + return ERR_PTR(-VM_FAULT_OOM); if (chg) if (hugetlb_get_quota(inode->i_mapping, chg)) - return ERR_PTR(-ENOSPC); + return ERR_PTR(-VM_FAULT_SIGBUS); spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); @@ -1105,12 +1105,28 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { - struct page *page = virt_to_page(m); struct hstate *h = m->hstate; + struct page *page; + +#ifdef CONFIG_HIGHMEM + page = pfn_to_page(m->phys >> PAGE_SHIFT); + free_bootmem_late((unsigned long)m, + sizeof(struct huge_bootmem_page)); +#else + page = virt_to_page(m); +#endif __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); prep_new_huge_page(h, page, page_to_nid(page)); + /* + * If we had gigantic hugepages allocated at boot time, we need + * to restore the 'stolen' pages to totalram_pages in order to + * fix confusing memory reports from free(1) and another + * side-effects, like CommitLimit going negative. + */ + if (h->order > (MAX_ORDER - 1)) + totalram_pages += 1 << h->order; } } @@ -1872,8 +1888,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; - if (!write) - tmp = h->max_huge_pages; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) return -EINVAL; @@ -1938,8 +1953,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; - if (!write) - tmp = h->nr_overcommit_huge_pages; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) return -EINVAL; @@ -2045,7 +2059,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA - * has a reference to the reservation map it cannot dissappear until + * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ @@ -2118,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, pte_t entry; entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); - } } @@ -2175,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) { + if (non_swap_entry(swp) && is_migration_entry(swp)) return 1; - } else + else return 0; } @@ -2188,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) return 1; - } else + else return 0; } @@ -2207,7 +2220,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long sz = huge_page_size(h); /* - * A page gathering list, protected by per file i_mmap_lock. The + * A page gathering list, protected by per file i_mmap_mutex. The * lock is used to avoid list corruption from multiple unmapping * of the same page since we are using page->lru. */ @@ -2276,9 +2289,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); __unmap_hugepage_range(vma, start, end, ref_page); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } /* @@ -2310,7 +2323,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -2328,7 +2341,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address, address + huge_page_size(h), page); } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return 1; } @@ -2492,7 +2505,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed - * COW. Warn that such a situation has occured as it may not be obvious + * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { printk(KERN_WARNING @@ -2553,7 +2566,7 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON | + ret = VM_FAULT_HWPOISON | VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } @@ -2621,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, (pmd_t *)ptep, address); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON_LARGE | + return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(h - hstates); } @@ -2812,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); @@ -2827,7 +2840,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); flush_tlb_range(vma, start, end); } @@ -2835,7 +2848,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - int acctflag) + vm_flags_t vm_flags) { long ret, chg; struct hstate *h = hstate_inode(inode); @@ -2845,7 +2858,7 @@ int hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * and filesystem quota without using reserves */ - if (acctflag & VM_NORESERVE) + if (vm_flags & VM_NORESERVE) return 0; /* diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 0948f1072d6..c7fc7fd00e3 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -1,4 +1,4 @@ -/* Inject a hwpoison memory failure on a arbitary pfn */ +/* Inject a hwpoison memory failure on a arbitrary pfn */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/kernel.h> diff --git a/mm/init-mm.c b/mm/init-mm.c index 1d29cdfe8eb..a56a851908d 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -5,7 +5,7 @@ #include <linux/list.h> #include <linux/cpumask.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -21,6 +21,5 @@ struct mm_struct init_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), - .cpu_vm_mask = CPU_MASK_ALL, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index 69488205723..d071d380fb4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -66,6 +66,10 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +/* mm/util.c */ +void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent); + #ifdef CONFIG_MMU extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -162,7 +166,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset) } /* - * Iterator over all subpages withing the maximally aligned gigantic + * Iterator over all subpages within the maximally aligned gigantic * page 'base'. Handle any discontiguity in the mem_map. */ static inline struct page *mem_map_next(struct page *iter, @@ -245,11 +249,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking); - #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 84225f3b719..d6880f542f9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -96,7 +96,7 @@ #include <asm/sections.h> #include <asm/processor.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/kmemcheck.h> #include <linux/kmemleak.h> @@ -265,7 +265,7 @@ static void kmemleak_disable(void); } while (0) /* - * Macro invoked when a serious kmemleak condition occured and cannot be + * Macro invoked when a serious kmemleak condition occurred and cannot be * recovered from. Kmemleak will be disabled and further allocation/freeing * tracing no longer available. */ @@ -1006,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object) /* * Memory scanning is a long process and it needs to be interruptable. This - * function checks whether such interrupt condition occured. + * function checks whether such interrupt condition occurred. */ static int scan_should_stop(void) { @@ -1414,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++(*pos); list_for_each_continue_rcu(n, &object_list) { - next_obj = list_entry(n, struct kmemleak_object, object_list); - if (get_object(next_obj)) + struct kmemleak_object *obj = + list_entry(n, struct kmemleak_object, object_list); + if (get_object(obj)) { + next_obj = obj; break; + } } put_object(prev_obj); @@ -1733,7 +1736,7 @@ static int __init kmemleak_late_init(void) if (atomic_read(&kmemleak_error)) { /* - * Some error occured and kmemleak was disabled. There is a + * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately * after setting kmemleak_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. @@ -35,6 +35,7 @@ #include <linux/ksm.h> #include <linux/hash.h> #include <linux/freezer.h> +#include <linux/oom.h> #include <asm/tlbflush.h> #include "internal.h" @@ -301,20 +302,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) return rmap_item->address & STABLE_FLAG; } -static void hold_anon_vma(struct rmap_item *rmap_item, - struct anon_vma *anon_vma) -{ - rmap_item->anon_vma = anon_vma; - get_anon_vma(anon_vma); -} - -static void ksm_drop_anon_vma(struct rmap_item *rmap_item) -{ - struct anon_vma *anon_vma = rmap_item->anon_vma; - - drop_anon_vma(anon_vma); -} - /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -397,7 +384,7 @@ static void break_cow(struct rmap_item *rmap_item) * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) @@ -466,7 +453,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } @@ -554,7 +541,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) else ksm_pages_shared--; - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -734,7 +721,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, swapped = PageSwapCache(page); flush_cache_page(vma, addr, page_to_pfn(page)); /* - * Ok this is tricky, when get_user_pages_fast() run it doesnt + * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racey and * O_DIRECT can happen right after the check. @@ -949,7 +936,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, goto out; /* Must get reference to anon_vma while still holding mmap_sem */ - hold_anon_vma(rmap_item, vma->anon_vma); + rmap_item->anon_vma = vma->anon_vma; + get_anon_vma(vma->anon_vma); out: up_read(&mm->mmap_sem); return err; @@ -1314,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); ksm_scan.mm_slot = slot; spin_unlock(&ksm_mmlist_lock); + /* + * Although we tested list_empty() above, a racing __ksm_exit + * of the last mm on the list may have removed it since then. + */ + if (slot == &ksm_mm_head) + return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &slot->rmap_list; @@ -1907,9 +1901,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - current->flags |= PF_OOM_ORIGIN; + int oom_score_adj; + + oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); err = unmerge_and_remove_all_rmap_items(); - current->flags &= ~PF_OOM_ORIGIN; + test_set_oom_score_adj(oom_score_adj); if (err) { ksm_run = KSM_RUN_STOP; count = err; diff --git a/mm/maccess.c b/mm/maccess.c index e2b6f5634e0..4cee182ab5f 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -15,10 +15,10 @@ * happens, handle that and return -EFAULT. */ -long __weak probe_kernel_read(void *dst, void *src, size_t size) +long __weak probe_kernel_read(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_read"))); -long __probe_kernel_read(void *dst, void *src, size_t size) +long __probe_kernel_read(void *dst, const void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -43,10 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long __weak probe_kernel_write(void *dst, void *src, size_t size) +long __weak probe_kernel_write(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_write"))); -long __probe_kernel_write(void *dst, void *src, size_t size) +long __probe_kernel_write(void *dst, const void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/madvise.c b/mm/madvise.c index 2221491ed50..74bf193eff0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma, endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + /* vmtruncate_range needs to take i_mutex */ up_read(¤t->mm->mmap_sem); error = vmtruncate_range(mapping->host, offset, endoff); down_read(¤t->mm->mmap_sem); diff --git a/mm/memblock.c b/mm/memblock.c index 4618fda975a..ccbf9733959 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } -static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) -{ - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; - - return 0; -} - -static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, - unsigned long r1, unsigned long r2) -{ - phys_addr_t base1 = type->regions[r1].base; - phys_addr_t size1 = type->regions[r1].size; - phys_addr_t base2 = type->regions[r2].base; - phys_addr_t size2 = type->regions[r2].size; - - return memblock_addrs_adjacent(base1, size1, base2, size2); -} - long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long i; @@ -206,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->regions[i].size = type->regions[i + 1].size; } type->cnt--; -} -/* Assumption: base addr of region 1 < base addr of region 2 */ -static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, - unsigned long r1, unsigned long r2) -{ - type->regions[r1].size += type->regions[r2].size; - memblock_remove_region(type, r2); + /* Special case for empty arrays */ + if (type->cnt == 0) { + type->cnt = 1; + type->regions[0].base = 0; + type->regions[0].size = 0; + } } /* Defined below but needed now */ @@ -276,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); /* If the array wasn't our static init one, then free it. We only do * that before SLAB is available as later on, we don't know whether @@ -296,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1 return 1; } -static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock memblock_add_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { - unsigned long coalesced = 0; - long adjacent, i; - - if ((type->cnt == 1) && (type->regions[0].size == 0)) { - type->regions[0].base = base; - type->regions[0].size = size; - return 0; - } + phys_addr_t end = base + size; + int i, slot = -1; - /* First try and coalesce this MEMBLOCK with another. */ + /* First try and coalesce this MEMBLOCK with others */ for (i = 0; i < type->cnt; i++) { - phys_addr_t rgnbase = type->regions[i].base; - phys_addr_t rgnsize = type->regions[i].size; + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rend = rgn->base + rgn->size; + + /* Exit if there's no possible hits */ + if (rgn->base > end || rgn->size == 0) + break; - if ((rgnbase == base) && (rgnsize == size)) - /* Already have this region, so we're done */ + /* Check if we are fully enclosed within an existing + * block + */ + if (rgn->base <= base && rend >= end) return 0; - adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); - /* Check if arch allows coalescing */ - if (adjacent != 0 && type == &memblock.memory && - !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) - break; - if (adjacent > 0) { - type->regions[i].base -= size; - type->regions[i].size += size; - coalesced++; - break; - } else if (adjacent < 0) { - type->regions[i].size += size; - coalesced++; - break; + /* Check if we overlap or are adjacent with the bottom + * of a block. + */ + if (base < rgn->base && end >= rgn->base) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(base, size, + rgn->base, + rgn->size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(end != rgn->base); + goto new_block; + } + /* We extend the bottom of the block down to our + * base + */ + rgn->base = base; + rgn->size = rend - base; + + /* Return if we have nothing else to allocate + * (fully coalesced) + */ + if (rend >= end) + return 0; + + /* We continue processing from the end of the + * coalesced block. + */ + base = rend; + size = end - base; + } + + /* Now check if we overlap or are adjacent with the + * top of a block + */ + if (base <= rend && end >= rend) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(rgn->base, + rgn->size, + base, size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(rend != base); + goto new_block; + } + /* We adjust our base down to enclose the + * original block and destroy it. It will be + * part of our new allocation. Since we've + * freed an entry, we know we won't fail + * to allocate one later, so we won't risk + * losing the original block allocation. + */ + size += (base - rgn->base); + base = rgn->base; + memblock_remove_region(type, i--); } } - /* If we plugged a hole, we may want to also coalesce with the - * next region + /* If the array is empty, special case, replace the fake + * filler region and return */ - if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && - ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, - type->regions[i].size, - type->regions[i+1].base, - type->regions[i+1].size)))) { - memblock_coalesce_regions(type, i, i+1); - coalesced++; + if ((type->cnt == 1) && (type->regions[0].size == 0)) { + type->regions[0].base = base; + type->regions[0].size = size; + return 0; } - if (coalesced) - return coalesced; - + new_block: /* If we are out of space, we fail. It's too late to resize the array * but then this shouldn't have happened in the first place. */ @@ -362,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys } else { type->regions[i+1].base = base; type->regions[i+1].size = size; + slot = i + 1; break; } } - if (base < type->regions[0].base) { type->regions[0].base = base; type->regions[0].size = size; + slot = 0; } type->cnt++; @@ -376,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys * our allocation and return an error */ if (type->cnt == type->max && memblock_double_array(type)) { - type->cnt--; + BUG_ON(slot < 0); + memblock_remove_region(type, slot); return -1; } @@ -389,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) } -static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock __memblock_remove(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { - phys_addr_t rgnbegin, rgnend; phys_addr_t end = base + size; int i; - rgnbegin = rgnend = 0; /* supress gcc warnings */ - - /* Find the region where (base, size) belongs to */ - for (i=0; i < type->cnt; i++) { - rgnbegin = type->regions[i].base; - rgnend = rgnbegin + type->regions[i].size; + /* Walk through the array for collisions */ + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rend = rgn->base + rgn->size; - if ((rgnbegin <= base) && (end <= rgnend)) + /* Nothing more to do, exit */ + if (rgn->base > end || rgn->size == 0) break; - } - /* Didn't find the region */ - if (i == type->cnt) - return -1; + /* If we fully enclose the block, drop it */ + if (base <= rgn->base && end >= rend) { + memblock_remove_region(type, i--); + continue; + } - /* Check to see if we are removing entire region */ - if ((rgnbegin == base) && (rgnend == end)) { - memblock_remove_region(type, i); - return 0; - } + /* If we are fully enclosed within a block + * then we need to split it and we are done + */ + if (base > rgn->base && end < rend) { + rgn->size = base - rgn->base; + if (!memblock_add_region(type, end, rend - end)) + return 0; + /* Failure to split is bad, we at least + * restore the block before erroring + */ + rgn->size = rend - rgn->base; + WARN_ON(1); + return -1; + } - /* Check to see if region is matching at the front */ - if (rgnbegin == base) { - type->regions[i].base = end; - type->regions[i].size -= size; - return 0; - } + /* Check if we need to trim the bottom of a block */ + if (rgn->base < end && rend > end) { + rgn->size -= end - rgn->base; + rgn->base = end; + break; + } - /* Check to see if the region is matching at the end */ - if (rgnend == end) { - type->regions[i].size -= size; - return 0; - } + /* And check if we need to trim the top of a block */ + if (base < rend) + rgn->size -= rend - base; - /* - * We need to split the entry - adjust the current one to the - * beginging of the hole and add the region after hole. - */ - type->regions[i].size = base - type->regions[i].base; - return memblock_add_region(type, end, rgnend - end); + } + return 0; } long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) @@ -467,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph found = memblock_find_base(size, align, 0, max_addr); if (found != MEMBLOCK_ERROR && - memblock_add_region(&memblock.reserved, found, size) >= 0) + !memblock_add_region(&memblock.reserved, found, size)) return found; return 0; @@ -548,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, if (this_nid == nid) { phys_addr_t ret = memblock_find_region(start, this_end, size, align); if (ret != MEMBLOCK_ERROR && - memblock_add_region(&memblock.reserved, ret, size) >= 0) + !memblock_add_region(&memblock.reserved, ret, size)) return ret; } start = this_end; @@ -735,9 +758,9 @@ void __init memblock_analyze(void) /* Check marker in the unused last array entry */ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); memblock.memory_size = 0; @@ -763,8 +786,8 @@ void __init memblock_init(void) memblock.reserved.max = INIT_MEMBLOCK_REGIONS; /* Write a marker in the unused last array entry */ - memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; - memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; /* Create a dummy zero size MEMBLOCK which will get coalesced away later. * This simplifies the memblock_add() code below... diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da53a252b25..930de943727 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0; #define do_swap_account (0) #endif -/* - * Per memcg event counter is incremented at every pagein/pageout. This counter - * is used for trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - * - * These values will be used as !((event) & ((1 <<(thresh)) - 1)) - */ -#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ -#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ /* * Statistics for memory cgroup. @@ -93,19 +84,40 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ - MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ - /* incremented at every pagein/pageout */ - MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ - MEM_CGROUP_STAT_NSTATS, }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ + MEM_CGROUP_EVENTS_NSTATS, +}; +/* + * Per memcg event counter is incremented at every pagein/pageout. With THP, + * it will be incremated by the number of pages. This counter is used for + * for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + */ +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_TARGET_NUMAINFO, + MEM_CGROUP_NTARGETS, +}; +#define THRESHOLDS_EVENTS_TARGET (128) +#define SOFTLIMIT_EVENTS_TARGET (1024) +#define NUMAINFO_EVENTS_TARGET (1024) + struct mem_cgroup_stat_cpu { - s64 count[MEM_CGROUP_STAT_NSTATS]; + long count[MEM_CGROUP_STAT_NSTATS]; + unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long targets[MEM_CGROUP_NTARGETS]; }; /* @@ -192,6 +204,50 @@ struct mem_cgroup_eventfd_list { static void mem_cgroup_threshold(struct mem_cgroup *mem); static void mem_cgroup_oom_notify(struct mem_cgroup *mem); +enum { + SCAN_BY_LIMIT, + SCAN_BY_SYSTEM, + NR_SCAN_CONTEXT, + SCAN_BY_SHRINK, /* not recorded now */ +}; + +enum { + SCAN, + SCAN_ANON, + SCAN_FILE, + ROTATE, + ROTATE_ANON, + ROTATE_FILE, + FREED, + FREED_ANON, + FREED_FILE, + ELAPSED, + NR_SCANSTATS, +}; + +struct scanstat { + spinlock_t lock; + unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; + unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; +}; + +const char *scanstat_string[NR_SCANSTATS] = { + "scanned_pages", + "scanned_anon_pages", + "scanned_file_pages", + "rotated_pages", + "rotated_anon_pages", + "rotated_file_pages", + "freed_pages", + "freed_anon_pages", + "freed_file_pages", + "elapsed_ns", +}; +#define SCANSTAT_WORD_LIMIT "_by_limit" +#define SCANSTAT_WORD_SYSTEM "_by_system" +#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" + + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -218,25 +274,28 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - - /* - protect against reclaim related member. - */ - spinlock_t reclaim_param_lock; - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. */ int last_scanned_child; + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + atomic_t numainfo_events; + atomic_t numainfo_updating; +#endif /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - atomic_t oom_lock; + + bool oom_lock; + atomic_t under_oom; + atomic_t refcnt; - unsigned int swappiness; + int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -254,7 +313,8 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - + /* For recording LRU-scan statistics */ + struct scanstat scanstat; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -327,13 +387,6 @@ enum charge_type { NR_CHARGE_TYPE, }; -/* only for here (for easy reading.) */ -#define PCGF_CACHE (1UL << PCG_CACHE) -#define PCGF_USED (1UL << PCG_USED) -#define PCGF_LOCK (1UL << PCG_LOCK) -/* Not used, but added here for completeness */ -#define PCGF_ACCT (1UL << PCG_ACCT) - /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) @@ -357,7 +410,7 @@ enum charge_type { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static void drain_all_stock_async(void); +static void drain_all_stock_async(struct mem_cgroup *mem); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) @@ -371,14 +424,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) +page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) { - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); - - if (!mem) - return NULL; + int nid = page_to_nid(page); + int zid = page_zonenum(page); return mem_cgroup_zoneinfo(mem, nid, zid); } @@ -504,11 +553,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) } } -static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) -{ - return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; -} - static struct mem_cgroup_per_zone * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { @@ -565,11 +609,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * common workload, threashold and synchonization as vmstat[] should be * implemented. */ -static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static long mem_cgroup_read_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { + long val = 0; int cpu; - s64 val = 0; get_online_cpus(); for_each_online_cpu(cpu) @@ -583,15 +627,6 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, return val; } -static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) -{ - s64 ret; - - ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); - ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); - return ret; -} - static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, bool charge) { @@ -599,6 +634,32 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } +void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); +} + +void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); +} + +static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, + enum mem_cgroup_events_index idx) +{ + unsigned long val = 0; + int cpu; + + for_each_online_cpu(cpu) + val += per_cpu(mem->stat->events[idx], cpu); +#ifdef CONFIG_HOTPLUG_CPU + spin_lock(&mem->pcp_counter_lock); + val += mem->nocpu_base.events[idx]; + spin_unlock(&mem->pcp_counter_lock); +#endif + return val; +} + static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, bool file, int nr_pages) { @@ -611,39 +672,89 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); else { - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); + __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); preempt_enable(); } -static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, - enum lru_list idx) +unsigned long +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, + unsigned int lru_mask) { - int nid, zid; struct mem_cgroup_per_zone *mz; + enum lru_list l; + unsigned long ret = 0; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + + for_each_lru(l) { + if (BIT(l) & lru_mask) + ret += MEM_CGROUP_ZSTAT(mz, l); + } + return ret; +} + +static unsigned long +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, + int nid, unsigned int lru_mask) +{ u64 total = 0; + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) + total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); - for_each_online_node(nid) - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - total += MEM_CGROUP_ZSTAT(mz, idx); - } return total; } -static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, + unsigned int lru_mask) { - s64 val; + int nid; + u64 total = 0; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); + return total; +} + +static bool __memcg_event_check(struct mem_cgroup *mem, int target) +{ + unsigned long val, next; + + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = this_cpu_read(mem->stat->targets[target]); + /* from time_after() in jiffies.h */ + return ((long)next - (long)val < 0); +} + +static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) +{ + unsigned long val, next; + + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); - val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_NUMAINFO: + next = val + NUMAINFO_EVENTS_TARGET; + break; + default: + return; + } - return !(val & ((1 << event_mask_shift) - 1)); + this_cpu_write(mem->stat->targets[target], next); } /* @@ -653,10 +764,23 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) static void memcg_check_events(struct mem_cgroup *mem, struct page *page) { /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { + if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { mem_cgroup_threshold(mem); - if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) + __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); + if (unlikely(__memcg_event_check(mem, + MEM_CGROUP_TARGET_SOFTLIMIT))) { mem_cgroup_update_tree(mem, page); + __mem_cgroup_target_update(mem, + MEM_CGROUP_TARGET_SOFTLIMIT); + } +#if MAX_NUMNODES > 1 + if (unlikely(__memcg_event_check(mem, + MEM_CGROUP_TARGET_NUMAINFO))) { + atomic_inc(&mem->numainfo_events); + __mem_cgroup_target_update(mem, + MEM_CGROUP_TARGET_NUMAINFO); + } +#endif } } @@ -681,7 +805,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup, css); } -static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; @@ -785,6 +909,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) return (mem == root_mem_cgroup); } +void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +{ + struct mem_cgroup *mem; + + if (!mm) + return; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) + goto out; + + switch (idx) { + case PGMAJFAULT: + mem_cgroup_pgmajfault(mem, 1); + break; + case PGFAULT: + mem_cgroup_pgfault(mem, 1); + break; + default: + BUG(); + } +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(mem_cgroup_count_vm_event); + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -815,7 +966,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) @@ -829,6 +980,32 @@ void mem_cgroup_del_lru(struct page *page) mem_cgroup_del_lru_list(page, page_lru(page)); } +/* + * Writeback is about to end against a page which has been marked for immediate + * reclaim. If it still appears to be reclaimable, move it to the tail of the + * inactive list. + */ +void mem_cgroup_rotate_reclaimable_page(struct page *page) +{ + struct mem_cgroup_per_zone *mz; + struct page_cgroup *pc; + enum lru_list lru = page_lru(page); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(page); + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); + list_move_tail(&pc->lru, &mz->lists[lru]); +} + void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -845,7 +1022,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) smp_rmb(); if (mem_cgroup_is_root(pc->mem_cgroup)) return; - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move(&pc->lru, &mz->lists[lru]); } @@ -862,7 +1039,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); @@ -872,18 +1049,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) } /* - * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to - * lru because the page may.be reused after it's fully uncharged (because of - * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge - * it again. This function is only used to charge SwapCache. It's done under - * lock_page and expected that zone->lru_lock is never held. + * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed + * while it's linked to lru because the page may be reused after it's fully + * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. + * It's done under lock_page and expected that zone->lru_lock isnever held. */ -static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) +static void mem_cgroup_lru_del_before_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Doing this check without taking ->lru_lock seems wrong but this + * is safe. Because if page_cgroup's USED bit is unset, the page + * will not be added to any memcg's LRU. If page_cgroup's USED bit is + * set, the commit after this will fail, anyway. + * This all charge/uncharge is done under some mutual execustion. + * So, we don't need to taking care of changes in USED bit. + */ + if (likely(!PageLRU(page))) + return; + spin_lock_irqsave(&zone->lru_lock, flags); /* * Forget old LRU when this page_cgroup is *not* used. This Used bit @@ -894,12 +1081,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) spin_unlock_irqrestore(&zone->lru_lock, flags); } -static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) +static void mem_cgroup_lru_add_after_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* taking care of that the page is added to LRU while we commit it */ + if (likely(!PageLRU(page))) + return; spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ if (PageLRU(page) && !PageCgroupAcctLRU(pc)) @@ -917,6 +1107,21 @@ void mem_cgroup_move_lists(struct page *page, mem_cgroup_add_lru_list(page, to); } +/* + * Checks whether given mem is same or in the root_mem's + * hierarchy subtree + */ +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, + struct mem_cgroup *mem) +{ + if (root_mem != mem) { + return (root_mem->use_hierarchy && + css_is_ancestor(&mem->css, &root_mem->css)); + } + + return true; +} + int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; @@ -936,10 +1141,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) * enabled in "curr" and "curr" is a child of "mem" in *cgroup* * hierarchy(even if use_hierarchy is disabled in "mem"). */ - if (mem->use_hierarchy) - ret = css_is_ancestor(&curr->css, &mem->css); - else - ret = (curr == mem); + ret = mem_cgroup_same_or_subtree(mem, curr); css_put(&curr->css); return ret; } @@ -951,8 +1153,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -991,23 +1193,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) unsigned long active; unsigned long inactive; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); return (active > inactive); } -unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - return MEM_CGROUP_ZSTAT(mz, lru); -} - struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { @@ -1032,10 +1223,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - mz = page_cgroup_zoneinfo(pc); - if (!mz) - return NULL; - + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); return &mz->reclaim_stat; } @@ -1067,9 +1255,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (scan >= nr_to_scan) break; - page = pc->page; if (unlikely(!PageCgroupUsed(pc))) continue; + + page = lookup_cgroup_page(pc); + if (unlikely(!PageLRU(page))) continue; @@ -1101,49 +1291,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) -{ - if (do_swap_account) { - if (res_counter_check_under_limit(&mem->res) && - res_counter_check_under_limit(&mem->memsw)) - return true; - } else - if (res_counter_check_under_limit(&mem->res)) - return true; - return false; -} - /** - * mem_cgroup_check_margin - check if the memory cgroup allows charging - * @mem: memory cgroup to check - * @bytes: the number of bytes the caller intends to charge + * mem_cgroup_margin - calculate chargeable space of a memory cgroup + * @mem: the memory cgroup * - * Returns a boolean value on whether @mem can be charged @bytes or - * whether this would exceed the limit. + * Returns the maximum amount of memory @mem can be charged with, in + * pages. */ -static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) { - if (!res_counter_check_margin(&mem->res, bytes)) - return false; - if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) - return false; - return true; + unsigned long long margin; + + margin = res_counter_margin(&mem->res); + if (do_swap_account) + margin = min(margin, res_counter_margin(&mem->memsw)); + return margin >> PAGE_SHIFT; } -static unsigned int get_swappiness(struct mem_cgroup *memcg) +int mem_cgroup_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; - unsigned int swappiness; /* root ? */ if (cgrp->parent == NULL) return vm_swappiness; - spin_lock(&memcg->reclaim_param_lock); - swappiness = memcg->swappiness; - spin_unlock(&memcg->reclaim_param_lock); - - return swappiness; + return memcg->swappiness; } static void mem_cgroup_start_move(struct mem_cgroup *mem) @@ -1207,10 +1380,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) to = mc.to; if (!from) goto unlock; - if (from == mem || to == mem - || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) - || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) - ret = true; + + ret = mem_cgroup_same_or_subtree(mem, from) + || mem_cgroup_same_or_subtree(mem, to); unlock: spin_unlock(&mc.lock); return ret; @@ -1359,18 +1531,191 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) rcu_read_unlock(); /* Updates scanning parameter */ - spin_lock(&root_mem->reclaim_param_lock); if (!css) { /* this means start scan from ID:1 */ root_mem->last_scanned_child = 0; } else root_mem->last_scanned_child = found; - spin_unlock(&root_mem->reclaim_param_lock); } return ret; } +/** + * test_mem_cgroup_node_reclaimable + * @mem: the target memcg + * @nid: the node ID to be checked. + * @noswap : specify true here if the user wants flle only information. + * + * This function returns whether the specified memcg contains any + * reclaimable pages on a node. Returns true if there are any reclaimable + * pages in the node. + */ +static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, + int nid, bool noswap) +{ + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) + return true; + if (noswap || !total_swap_pages) + return false; + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) + return true; + return false; + +} +#if MAX_NUMNODES > 1 + +/* + * Always updating the nodemask is not very good - even if we have an empty + * list or the wrong list here, we can start from some node and traverse all + * nodes based on the zonelist. So update the list loosely once per 10 secs. + * + */ +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +{ + int nid; + /* + * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET + * pagein/pageout changes since the last update. + */ + if (!atomic_read(&mem->numainfo_events)) + return; + if (atomic_inc_return(&mem->numainfo_updating) > 1) + return; + + /* make a nodemask where this memcg uses memory from */ + mem->scan_nodes = node_states[N_HIGH_MEMORY]; + + for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { + + if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) + node_clear(nid, mem->scan_nodes); + } + + atomic_set(&mem->numainfo_events, 0); + atomic_set(&mem->numainfo_updating, 0); +} + +/* + * Selecting a node where we start reclaim from. Because what we need is just + * reducing usage counter, start from anywhere is O,K. Considering + * memory reclaim from current node, there are pros. and cons. + * + * Freeing memory from current node means freeing memory from a node which + * we'll use or we've used. So, it may make LRU bad. And if several threads + * hit limits, it will see a contention on a node. But freeing from remote + * node means more costs for memory reclaim because of memory latency. + * + * Now, we use round-robin. Better algorithm is welcomed. + */ +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + int node; + + mem_cgroup_may_update_nodemask(mem); + node = mem->last_scanned_node; + + node = next_node(node, mem->scan_nodes); + if (node == MAX_NUMNODES) + node = first_node(mem->scan_nodes); + /* + * We call this when we hit limit, not when pages are added to LRU. + * No LRU may hold pages because all pages are UNEVICTABLE or + * memcg is too small and all pages are not on LRU. In that case, + * we use curret node. + */ + if (unlikely(node == MAX_NUMNODES)) + node = numa_node_id(); + + mem->last_scanned_node = node; + return node; +} + +/* + * Check all nodes whether it contains reclaimable pages or not. + * For quick scan, we make use of scan_nodes. This will allow us to skip + * unused nodes. But scan_nodes is lazily updated and may not cotain + * enough new information. We need to do double check. + */ +bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +{ + int nid; + + /* + * quick check...making use of scan_node. + * We can skip unused nodes. + */ + if (!nodes_empty(mem->scan_nodes)) { + for (nid = first_node(mem->scan_nodes); + nid < MAX_NUMNODES; + nid = next_node(nid, mem->scan_nodes)) { + + if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + return true; + } + } + /* + * Check rest of nodes. + */ + for_each_node_state(nid, N_HIGH_MEMORY) { + if (node_isset(nid, mem->scan_nodes)) + continue; + if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + return true; + } + return false; +} + +#else +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + return 0; +} + +bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +{ + return test_mem_cgroup_node_reclaimable(mem, 0, noswap); +} +#endif + +static void __mem_cgroup_record_scanstat(unsigned long *stats, + struct memcg_scanrecord *rec) +{ + + stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; + stats[SCAN_ANON] += rec->nr_scanned[0]; + stats[SCAN_FILE] += rec->nr_scanned[1]; + + stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; + stats[ROTATE_ANON] += rec->nr_rotated[0]; + stats[ROTATE_FILE] += rec->nr_rotated[1]; + + stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; + stats[FREED_ANON] += rec->nr_freed[0]; + stats[FREED_FILE] += rec->nr_freed[1]; + + stats[ELAPSED] += rec->elapsed; +} + +static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) +{ + struct mem_cgroup *mem; + int context = rec->context; + + if (context >= NR_SCAN_CONTEXT) + return; + + mem = rec->mem; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); + spin_unlock(&mem->scanstat.lock); + + mem = rec->root; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); + spin_unlock(&mem->scanstat.lock); +} + /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1386,7 +1731,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, struct zone *zone, gfp_t gfp_mask, - unsigned long reclaim_options) + unsigned long reclaim_options, + unsigned long *total_scanned) { struct mem_cgroup *victim; int ret, total = 0; @@ -1394,18 +1740,37 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - unsigned long excess = mem_cgroup_get_excess(root_mem); + struct memcg_scanrecord rec; + unsigned long excess; + unsigned long scanned; + + excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (root_mem->memsw_is_minimum) + if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; + if (shrink) + rec.context = SCAN_BY_SHRINK; + else if (check_soft) + rec.context = SCAN_BY_SYSTEM; + else + rec.context = SCAN_BY_LIMIT; + + rec.root = root_mem; + while (1) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { loop++; - if (loop >= 1) - drain_all_stock_async(); + /* + * We are not draining per cpu cached charges during + * soft limit reclaim because global reclaim doesn't + * care about charges. It tries to free some memory and + * charges will not give any. + */ + if (!check_soft && loop >= 1) + drain_all_stock_async(root_mem); if (loop >= 2) { /* * If we have not been able to reclaim @@ -1417,7 +1782,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, break; } /* - * We want to do more targetted reclaim. + * We want to do more targeted reclaim. * excess >> 2 is not to excessive so as to * reclaim too much, nor too less that we keep * coming back to reclaim from this cgroup @@ -1429,18 +1794,28 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, } } } - if (!mem_cgroup_local_usage(victim)) { + if (!mem_cgroup_reclaimable(victim, noswap)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; } + rec.mem = victim; + rec.nr_scanned[0] = 0; + rec.nr_scanned[1] = 0; + rec.nr_rotated[0] = 0; + rec.nr_rotated[1] = 0; + rec.nr_freed[0] = 0; + rec.nr_freed[1] = 0; + rec.elapsed = 0; /* we use swappiness of local cgroup */ - if (check_soft) + if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone); - else + noswap, zone, &rec, &scanned); + *total_scanned += scanned; + } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, get_swappiness(victim)); + noswap, &rec); + mem_cgroup_record_scanstat(&rec); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -1451,10 +1826,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; total += ret; if (check_soft) { - if (res_counter_check_under_soft_limit(&root_mem->res)) + if (!res_counter_soft_limit_excess(&root_mem->res)) return total; - } else if (mem_cgroup_check_under_limit(root_mem)) - return 1 + total; + } else if (mem_cgroup_margin(root_mem)) + return total; } return total; } @@ -1462,38 +1837,84 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. + * Has to be called with memcg_oom_lock */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int x, lock_count = 0; - struct mem_cgroup *iter; + int lock_count = -1; + struct mem_cgroup *iter, *failed = NULL; + bool cond = true; - for_each_mem_cgroup_tree(iter, mem) { - x = atomic_inc_return(&iter->oom_lock); - lock_count = max(x, lock_count); + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + bool locked = iter->oom_lock; + + iter->oom_lock = true; + if (lock_count == -1) + lock_count = iter->oom_lock; + else if (lock_count != locked) { + /* + * this subtree of our hierarchy is already locked + * so we cannot give a lock. + */ + lock_count = 0; + failed = iter; + cond = false; + } } - if (lock_count == 1) - return true; - return false; + if (!failed) + goto done; + + /* + * OK, we failed to lock the whole subtree so we have to clean up + * what we set up to the failing subtree + */ + cond = true; + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + if (iter == failed) { + cond = false; + continue; + } + iter->oom_lock = false; + } +done: + return lock_count; } +/* + * Has to be called with memcg_oom_lock + */ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { struct mem_cgroup *iter; + for_each_mem_cgroup_tree(iter, mem) + iter->oom_lock = false; + return 0; +} + +static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + atomic_inc(&iter->under_oom); +} + +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + /* * When a new child is created while the hierarchy is under oom, * mem_cgroup_oom_lock() may not be called. We have to use * atomic_add_unless() here. */ for_each_mem_cgroup_tree(iter, mem) - atomic_add_unless(&iter->oom_lock, -1, 0); - return 0; + atomic_add_unless(&iter->under_oom, -1, 0); } - -static DEFINE_MUTEX(memcg_oom_mutex); +static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { @@ -1504,25 +1925,20 @@ struct oom_wait_info { static int memcg_oom_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, + *oom_wait_mem; struct oom_wait_info *oom_wait_info; oom_wait_info = container_of(wait, struct oom_wait_info, wait); + oom_wait_mem = oom_wait_info->mem; - if (oom_wait_info->mem == wake_mem) - goto wakeup; - /* if no hierarchy, no match */ - if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) - return 0; /* * Both of oom_wait_info->mem and wake_mem are stable under us. * Then we can use css_is_ancestor without taking care of RCU. */ - if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && - !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) + && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) return 0; - -wakeup: return autoremove_wake_function(wait, mode, sync, arg); } @@ -1534,7 +1950,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) static void memcg_oom_recover(struct mem_cgroup *mem) { - if (mem && atomic_read(&mem->oom_lock)) + if (mem && atomic_read(&mem->under_oom)) memcg_wakeup_oom(mem); } @@ -1552,8 +1968,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); need_to_kill = true; + mem_cgroup_mark_under_oom(mem); + /* At first, try to OOM lock hierarchy under mem.*/ - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); locked = mem_cgroup_oom_lock(mem); /* * Even if signal_pending(), we can't quit charge() loop without @@ -1565,7 +1983,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) need_to_kill = false; if (locked) mem_cgroup_oom_notify(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); @@ -1574,10 +1992,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); } - mutex_lock(&memcg_oom_mutex); - mem_cgroup_oom_unlock(mem); + spin_lock(&memcg_oom_lock); + if (locked) + mem_cgroup_oom_unlock(mem); memcg_wakeup_oom(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); + + mem_cgroup_unmark_under_oom(mem); if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; @@ -1661,17 +2082,19 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat); * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. */ -#define CHARGE_SIZE (32 * PAGE_SIZE) +#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ - int charge; + unsigned int nr_pages; struct work_struct work; + unsigned long flags; +#define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); -static atomic_t memcg_drain_count; +static DEFINE_MUTEX(percpu_charge_mutex); /* - * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed + * Try to consume stocked charge on this cpu. If success, one page is consumed * from local stock and true is returned. If the stock is 0 or charges from a * cgroup which is not current target, returns false. This stock will be * refilled. @@ -1682,8 +2105,8 @@ static bool consume_stock(struct mem_cgroup *mem) bool ret = true; stock = &get_cpu_var(memcg_stock); - if (mem == stock->cached && stock->charge) - stock->charge -= PAGE_SIZE; + if (mem == stock->cached && stock->nr_pages) + stock->nr_pages--; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -1697,13 +2120,15 @@ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; - if (stock->charge) { - res_counter_uncharge(&old->res, stock->charge); + if (stock->nr_pages) { + unsigned long bytes = stock->nr_pages * PAGE_SIZE; + + res_counter_uncharge(&old->res, bytes); if (do_swap_account) - res_counter_uncharge(&old->memsw, stock->charge); + res_counter_uncharge(&old->memsw, bytes); + stock->nr_pages = 0; } stock->cached = NULL; - stock->charge = 0; } /* @@ -1714,13 +2139,14 @@ static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } /* * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *mem, int val) +static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) { struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); @@ -1728,46 +2154,81 @@ static void refill_stock(struct mem_cgroup *mem, int val) drain_stock(stock); stock->cached = mem; } - stock->charge += val; + stock->nr_pages += nr_pages; put_cpu_var(memcg_stock); } /* - * Tries to drain stocked charges in other cpus. This function is asynchronous - * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. + * Drains all per-CPU charge caches for given root_mem resp. subtree + * of the hierarchy under it. sync flag says whether we should block + * until the work is done. */ -static void drain_all_stock_async(void) +static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) { - int cpu; - /* This function is for scheduling "drain" in asynchronous way. - * The result of "drain" is not directly handled by callers. Then, - * if someone is calling drain, we don't have to call drain more. - * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if - * there is a race. We just do loose check here. - */ - if (atomic_read(&memcg_drain_count)) - return; + int cpu, curcpu; + /* Notify other cpus that system-wide "drain" is running */ - atomic_inc(&memcg_drain_count); get_online_cpus(); + /* + * Get a hint for avoiding draining charges on the current cpu, + * which must be exhausted by our charging. It is not required that + * this be a precise check, so we use raw_smp_processor_id() instead of + * getcpu()/putcpu(). + */ + curcpu = raw_smp_processor_id(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + struct mem_cgroup *mem; + + mem = stock->cached; + if (!mem || !stock->nr_pages) + continue; + if (!mem_cgroup_same_or_subtree(root_mem, mem)) + continue; + if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (cpu == curcpu) + drain_local_stock(&stock->work); + else + schedule_work_on(cpu, &stock->work); + } + } + + if (!sync) + goto out; + for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - schedule_work_on(cpu, &stock->work); + if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + flush_work(&stock->work); } +out: put_online_cpus(); - atomic_dec(&memcg_drain_count); - /* We don't wait for flush_work */ +} + +/* + * Tries to drain stocked charges in other cpus. This function is asynchronous + * and just put a work per cpu for draining localy on each cpu. Caller can + * expects some charges will be back to res_counter later but cannot wait for + * it. + */ +static void drain_all_stock_async(struct mem_cgroup *root_mem) +{ + /* + * If someone calls draining, avoid adding more kworker runs. + */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; + drain_all_stock(root_mem, false); + mutex_unlock(&percpu_charge_mutex); } /* This is a synchronous drain interface. */ -static void drain_all_stock_sync(void) +static void drain_all_stock_sync(struct mem_cgroup *root_mem) { /* called when force_empty is called */ - atomic_inc(&memcg_drain_count); - schedule_on_each_cpu(drain_local_stock); - atomic_dec(&memcg_drain_count); + mutex_lock(&percpu_charge_mutex); + drain_all_stock(root_mem, true); + mutex_unlock(&percpu_charge_mutex); } /* @@ -1780,11 +2241,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) spin_lock(&mem->pcp_counter_lock); for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { - s64 x = per_cpu(mem->stat->count[i], cpu); + long x = per_cpu(mem->stat->count[i], cpu); per_cpu(mem->stat->count[i], cpu) = 0; mem->nocpu_base.count[i] += x; } + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long x = per_cpu(mem->stat->events[i], cpu); + + per_cpu(mem->stat->events[i], cpu) = 0; + mem->nocpu_base.events[i] += x; + } /* need to clear ON_MOVE value, works as a kind of lock. */ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; spin_unlock(&mem->pcp_counter_lock); @@ -1834,9 +2301,10 @@ enum { CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; -static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, - int csize, bool oom_check) +static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, + unsigned int nr_pages, bool oom_check) { + unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; unsigned long flags = 0; @@ -1857,22 +2325,21 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * csize can be either a huge page (HPAGE_SIZE), a batch of - * regular pages (CHARGE_SIZE), or a single regular page - * (PAGE_SIZE). + * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch + * of regular pages (CHARGE_BATCH), or a single regular page (1). * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (csize == CHARGE_SIZE) + if (nr_pages == CHARGE_BATCH) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); - if (mem_cgroup_check_margin(mem_over_limit, csize)) + gfp_mask, flags, NULL); + if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* * Even though the limit is exceeded at this point, reclaim @@ -1883,7 +2350,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (csize == PAGE_SIZE && ret) + if (nr_pages == 1 && ret) return CHARGE_RETRY; /* @@ -1909,13 +2376,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, - struct mem_cgroup **memcg, bool oom, - int page_size) + unsigned int nr_pages, + struct mem_cgroup **memcg, + bool oom) { + unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1940,7 +2408,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (page_size == PAGE_SIZE && consume_stock(mem)) + if (nr_pages == 1 && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1963,7 +2431,7 @@ again: rcu_read_unlock(); goto done; } - if (page_size == PAGE_SIZE && consume_stock(mem)) { + if (nr_pages == 1 && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1998,13 +2466,12 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); - + ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); switch (ret) { case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = page_size; + batch = nr_pages; css_put(&mem->css); mem = NULL; goto again; @@ -2025,8 +2492,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > page_size) - refill_stock(mem, csize - page_size); + if (batch > nr_pages) + refill_stock(mem, batch - nr_pages); css_put(&mem->css); done: *memcg = mem; @@ -2045,21 +2512,17 @@ bypass: * gotten by try_charge(). */ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, - unsigned long count) + unsigned int nr_pages) { if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE * count); + unsigned long bytes = nr_pages * PAGE_SIZE; + + res_counter_uncharge(&mem->res, bytes); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); + res_counter_uncharge(&mem->memsw, bytes); } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, - int page_size) -{ - __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); -} - /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if @@ -2108,20 +2571,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) } static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page *page, + unsigned int nr_pages, struct page_cgroup *pc, - enum charge_type ctype, - int page_size) + enum charge_type ctype) { - int nr_pages = page_size >> PAGE_SHIFT; - - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); + __mem_cgroup_cancel_charge(mem, nr_pages); return; } /* @@ -2158,7 +2616,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ - memcg_check_events(mem, pc->page); + memcg_check_events(mem, page); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2195,7 +2653,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) * We hold lru_lock, then, reduce counter directly. */ lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc); + mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); MEM_CGROUP_ZSTAT(mz, lru) -= 1; } tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; @@ -2204,7 +2662,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) #endif /** - * __mem_cgroup_move_account - move account of the page + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. @@ -2212,25 +2672,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) - * - the pc is locked, used, and ->mem_cgroup points to @from. + * - compound_lock is held when nr_pages > 1 * * This function doesn't do "charge" nor css_get to new cgroup. It should be - * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is + * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is * true, this function does "uncharge" from old cgroup, but it doesn't if * @uncharge is false, so a caller should do "uncharge". */ - -static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, - int charge_size) +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct page_cgroup *pc, + struct mem_cgroup *from, + struct mem_cgroup *to, + bool uncharge) { - int nr_pages = charge_size >> PAGE_SHIFT; + unsigned long flags; + int ret; VM_BUG_ON(from == to); - VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!page_is_cgroup_locked(pc)); - VM_BUG_ON(!PageCgroupUsed(pc)); - VM_BUG_ON(pc->mem_cgroup != from); + VM_BUG_ON(PageLRU(page)); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + lock_page_cgroup(pc); + + ret = -EINVAL; + if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + goto unlock; + + move_lock_page_cgroup(pc, &flags); if (PageCgroupFileMapped(pc)) { /* Update mapped_file data for mem_cgroup */ @@ -2242,7 +2719,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from, charge_size); + __mem_cgroup_cancel_charge(from, nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2251,43 +2728,19 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of * this function is just force_empty() and move charge, so it's - * garanteed that "to" is never removed. So, we don't check rmdir + * guaranteed that "to" is never removed. So, we don't check rmdir * status here. */ -} - -/* - * check whether the @pc is valid for moving account and call - * __mem_cgroup_move_account() - */ -static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, - bool uncharge, int charge_size) -{ - int ret = -EINVAL; - unsigned long flags; - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) - return -EBUSY; - - lock_page_cgroup(pc); - if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { - move_lock_page_cgroup(pc, &flags); - __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); - move_unlock_page_cgroup(pc, &flags); - ret = 0; - } + move_unlock_page_cgroup(pc, &flags); + ret = 0; +unlock: unlock_page_cgroup(pc); /* * check events */ - memcg_check_events(to, pc->page); - memcg_check_events(from, pc->page); + memcg_check_events(to, page); + memcg_check_events(from, page); +out: return ret; } @@ -2295,16 +2748,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, * move charges to its parent. */ -static int mem_cgroup_move_parent(struct page_cgroup *pc, +static int mem_cgroup_move_parent(struct page *page, + struct page_cgroup *pc, struct mem_cgroup *child, gfp_t gfp_mask) { - struct page *page = pc->page; struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - int page_size = PAGE_SIZE; - unsigned long flags; + unsigned int nr_pages; + unsigned long uninitialized_var(flags); int ret; /* Is ROOT ? */ @@ -2317,23 +2770,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (isolate_lru_page(page)) goto put; - if (PageTransHuge(page)) - page_size = HPAGE_SIZE; + nr_pages = hpage_nr_pages(page); parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, - &parent, false, page_size); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); if (ret || !parent) goto put_back; - if (page_size > PAGE_SIZE) + if (nr_pages > 1) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(pc, child, parent, true, page_size); + ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent, page_size); + __mem_cgroup_cancel_charge(parent, nr_pages); - if (page_size > PAGE_SIZE) + if (nr_pages > 1) compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); @@ -2353,13 +2804,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; - int page_size = PAGE_SIZE; + unsigned int nr_pages = 1; struct page_cgroup *pc; bool oom = true; int ret; if (PageTransHuge(page)) { - page_size <<= compound_order(page); + nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); /* * Never OOM-kill a process for a huge page. The @@ -2369,16 +2820,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } pc = lookup_page_cgroup(page); - /* can happen at boot */ - if (unlikely(!pc)) - return 0; - prefetchw(pc); + BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype, page_size); + __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); return 0; } @@ -2406,51 +2854,52 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); +static void +__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, + enum charge_type ctype) +{ + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * In some case, SwapCache, FUSE(splice_buf->radixtree), the page + * is already on LRU. It means the page may on some other page_cgroup's + * LRU. Take care of it. + */ + mem_cgroup_lru_del_before_commit(page); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + mem_cgroup_lru_add_after_commit(page); + return; +} + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + struct mem_cgroup *mem = NULL; int ret; if (mem_cgroup_disabled()) return 0; if (PageCompound(page)) return 0; - /* - * Corner case handling. This is called from add_to_page_cache() - * in usual. But some FS (shmem) precharges this page before calling it - * and call add_to_page_cache() with GFP_NOWAIT. - * - * For GFP_NOWAIT case, the page may be pre-charged before calling - * add_to_page_cache(). (See shmem.c) check it here and avoid to call - * charge twice. (It works but has to pay a bit larger cost.) - * And when the page is SwapCache, it should take swap information - * into account. This is under lock_page() now. - */ - if (!(gfp_mask & __GFP_WAIT)) { - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - if (!pc) - return 0; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - unlock_page_cgroup(pc); - return 0; - } - unlock_page_cgroup(pc); - } if (unlikely(!mm)) mm = &init_mm; - if (page_is_file_cache(page)) - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + if (page_is_file_cache(page)) { + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); + if (ret || !mem) + return ret; + /* + * FUSE reuses pages without going through the final + * put that would remove them from the LRU list, make + * sure that they get relinked properly. + */ + __mem_cgroup_commit_charge_lrucare(page, mem, + MEM_CGROUP_CHARGE_TYPE_CACHE); + return ret; + } /* shmem */ if (PageSwapCache(page)) { - struct mem_cgroup *mem = NULL; - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) __mem_cgroup_commit_charge_swapin(page, mem, @@ -2475,6 +2924,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct mem_cgroup *mem; int ret; + *ptr = NULL; + if (mem_cgroup_disabled()) return 0; @@ -2492,30 +2943,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); + return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); } static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; if (!ptr) return; cgroup_exclude_rmdir(&ptr->css); - pc = lookup_page_cgroup(page); - mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); - mem_cgroup_lru_add_after_commit_swapcache(page); + + __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -2563,15 +3010,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem, PAGE_SIZE); + __mem_cgroup_cancel_charge(mem, 1); } -static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, - int page_size) +static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, + unsigned int nr_pages, + const enum charge_type ctype) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; + /* If swapout, usage of swap doesn't decrease */ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) uncharge_memsw = false; @@ -2586,7 +3034,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, batch->memcg = mem; /* * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continously can be expected to be in + * In those cases, all pages freed continuously can be expected to be in * the same cgroup and we have chance to coalesce uncharges. * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) * because we want to do uncharge as soon as possible. @@ -2595,7 +3043,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; - if (page_size != PAGE_SIZE) + if (nr_pages > 1) goto direct_uncharge; /* @@ -2606,14 +3054,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, if (batch->memcg != mem) goto direct_uncharge; /* remember freed charge and uncharge it later */ - batch->bytes += PAGE_SIZE; + batch->nr_pages++; if (uncharge_memsw) - batch->memsw_bytes += PAGE_SIZE; + batch->memsw_nr_pages++; return; direct_uncharge: - res_counter_uncharge(&mem->res, page_size); + res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, page_size); + res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2625,10 +3073,9 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - int count; - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; - int page_size = PAGE_SIZE; + unsigned int nr_pages = 1; + struct page_cgroup *pc; if (mem_cgroup_disabled()) return NULL; @@ -2637,11 +3084,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) return NULL; if (PageTransHuge(page)) { - page_size <<= compound_order(page); + nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); } - - count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2674,7 +3119,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); ClearPageCgroupUsed(pc); /* @@ -2695,7 +3140,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype, page_size); + mem_cgroup_do_uncharge(mem, nr_pages, ctype); return mem; @@ -2735,8 +3180,8 @@ void mem_cgroup_uncharge_start(void) /* We can do nest. */ if (current->memcg_batch.do_batch == 1) { current->memcg_batch.memcg = NULL; - current->memcg_batch.bytes = 0; - current->memcg_batch.memsw_bytes = 0; + current->memcg_batch.nr_pages = 0; + current->memcg_batch.memsw_nr_pages = 0; } } @@ -2757,10 +3202,12 @@ void mem_cgroup_uncharge_end(void) * This "batch->memcg" is valid without any css_get/put etc... * bacause we hide charges behind us. */ - if (batch->bytes) - res_counter_uncharge(&batch->memcg->res, batch->bytes); - if (batch->memsw_bytes) - res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + if (batch->nr_pages) + res_counter_uncharge(&batch->memcg->res, + batch->nr_pages * PAGE_SIZE); + if (batch->memsw_nr_pages) + res_counter_uncharge(&batch->memcg->memsw, + batch->memsw_nr_pages * PAGE_SIZE); memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; @@ -2883,13 +3330,15 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. */ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr) + struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) { - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + struct page_cgroup *pc; enum charge_type ctype; int ret = 0; + *ptr = NULL; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2940,7 +3389,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2967,7 +3416,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); return ret; } @@ -3020,30 +3469,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, cgroup_release_and_wakeup_rmdir(&mem->css); } -/* - * A call to try to shrink memory usage on charge failure at shmem's swapin. - * Calling hierarchical_reclaim is not enough because we should update - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. - * Moreover considering hierarchy, we should reclaim from the mem_over_limit, - * not from the memcg which this page would be charged to. - * try_charge_swapin does all of these works properly. - */ -int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask) +#ifdef CONFIG_DEBUG_VM +static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { - struct mem_cgroup *mem = NULL; - int ret; + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); + if (likely(pc) && PageCgroupUsed(pc)) + return pc; + return NULL; +} +bool mem_cgroup_bad_page_check(struct page *page) +{ if (mem_cgroup_disabled()) - return 0; + return false; - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); - if (!ret) - mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ + return lookup_page_cgroup_used(page) != NULL; +} - return ret; +void mem_cgroup_print_bad_page(struct page *page) +{ + struct page_cgroup *pc; + + pc = lookup_page_cgroup_used(page); + if (pc) { + int ret = -1; + char *path; + + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + pc, pc->flags, pc->mem_cgroup); + + path = kmalloc(PATH_MAX, GFP_KERNEL); + if (path) { + rcu_read_lock(); + ret = cgroup_path(pc->mem_cgroup->css.cgroup, + path, PATH_MAX); + rcu_read_unlock(); + } + + printk(KERN_CONT "(%s)\n", + (ret < 0) ? "cannot get the path" : path); + kfree(path); + } } +#endif static DEFINE_MUTEX(set_limit_mutex); @@ -3102,7 +3572,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3162,7 +3633,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3176,7 +3648,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, + unsigned long *total_scanned) { unsigned long nr_reclaimed = 0; struct mem_cgroup_per_zone *mz, *next_mz = NULL; @@ -3184,6 +3657,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, int loop = 0; struct mem_cgroup_tree_per_zone *mctz; unsigned long long excess; + unsigned long nr_scanned; if (order > 0) return 0; @@ -3202,10 +3676,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, if (!mz) break; + nr_scanned = 0; reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, gfp_mask, - MEM_CGROUP_RECLAIM_SOFT); + MEM_CGROUP_RECLAIM_SOFT, + &nr_scanned); nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; spin_lock(&mctz->lock); /* @@ -3228,10 +3705,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, */ next_mz = __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) { + if (next_mz == mz) css_put(&next_mz->mem->css); - next_mz = NULL; - } else /* next_mz == NULL or other memcg */ + else /* next_mz == NULL or other memcg */ break; } while (1); } @@ -3288,6 +3764,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, loop += 256; busy = NULL; while (loop--) { + struct page *page; + ret = 0; spin_lock_irqsave(&zone->lru_lock, flags); if (list_empty(list)) { @@ -3303,7 +3781,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } spin_unlock_irqrestore(&zone->lru_lock, flags); - ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); + page = lookup_cgroup_page(pc); + + ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); if (ret == -ENOMEM) break; @@ -3347,7 +3827,7 @@ move_account: goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); - drain_all_stock_sync(); + drain_all_stock_sync(mem); ret = 0; mem_cgroup_start_move(mem); for_each_node_state(node, N_HIGH_MEMORY) { @@ -3386,14 +3866,18 @@ try_to_free: /* try to free all pages in this cgroup */ shrink = 1; while (nr_retries && mem->res.usage > 0) { + struct memcg_scanrecord rec; int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } + rec.context = SCAN_BY_SHRINK; + rec.mem = mem; + rec.root = mem; progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false, get_swappiness(mem)); + false, &rec); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -3451,13 +3935,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, } -static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; - s64 val = 0; + long val = 0; - /* each per cpu's value can be minus.Then, use s64 */ + /* Per-cpu values can be negative, use a signed accumulator */ for_each_mem_cgroup_tree(iter, mem) val += mem_cgroup_read_stat(iter, idx); @@ -3477,12 +3961,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) return res_counter_read_u64(&mem->memsw, RES_USAGE); } - val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } @@ -3660,6 +4143,8 @@ enum { MCS_PGPGIN, MCS_PGPGOUT, MCS_SWAP, + MCS_PGFAULT, + MCS_PGMAJFAULT, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -3682,6 +4167,8 @@ struct { {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, {"swap", "total_swap"}, + {"pgfault", "total_pgfault"}, + {"pgmajfault", "total_pgmajfault"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -3702,25 +4189,29 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) s->stat[MCS_RSS] += val * PAGE_SIZE; val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); + s->stat[MCS_PGFAULT] += val; + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); + s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ - val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); + val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; } @@ -3733,6 +4224,53 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) mem_cgroup_get_local_stat(iter, s); } +#ifdef CONFIG_NUMA +static int mem_control_numa_stat_show(struct seq_file *m, void *arg) +{ + int nid; + unsigned long total_nr, file_nr, anon_nr, unevictable_nr; + unsigned long node_nr; + struct cgroup *cont = m->private; + struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); + + total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); + seq_printf(m, "total=%lu", total_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); + seq_printf(m, "file=%lu", file_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + LRU_ALL_FILE); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); + seq_printf(m, "anon=%lu", anon_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + LRU_ALL_ANON); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); + seq_printf(m, "unevictable=%lu", unevictable_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + BIT(LRU_UNEVICTABLE)); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + return 0; +} +#endif /* CONFIG_NUMA */ + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { @@ -3743,6 +4281,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); + for (i = 0; i < NR_MCS_STAT; i++) { if (i == MCS_SWAP && !do_swap_account) continue; @@ -3802,7 +4341,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - return get_swappiness(memcg); + return mem_cgroup_swappiness(memcg); } static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, @@ -3828,9 +4367,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; } - spin_lock(&memcg->reclaim_param_lock); memcg->swappiness = val; - spin_unlock(&memcg->reclaim_param_lock); cgroup_unlock(); @@ -4094,15 +4631,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, if (!event) return -ENOMEM; - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); event->eventfd = eventfd; list_add(&event->list, &memcg->oom_notify); /* already in OOM ? */ - if (atomic_read(&memcg->oom_lock)) + if (atomic_read(&memcg->under_oom)) eventfd_signal(eventfd, 1); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); return 0; } @@ -4116,7 +4653,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, BUG_ON(type != _OOM_TYPE); - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { if (ev->eventfd == eventfd) { @@ -4125,7 +4662,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, } } - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); } static int mem_cgroup_oom_control_read(struct cgroup *cgrp, @@ -4135,7 +4672,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); - if (atomic_read(&mem->oom_lock)) + if (atomic_read(&mem->under_oom)) cb->fill(cb, "under_oom", 1); else cb->fill(cb, "under_oom", 0); @@ -4168,6 +4705,70 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, return 0; } +#ifdef CONFIG_NUMA +static const struct file_operations mem_control_numa_stat_file_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mem_control_numa_stat_open(struct inode *unused, struct file *file) +{ + struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; + + file->f_op = &mem_control_numa_stat_file_operations; + return single_open(file, mem_control_numa_stat_show, cont); +} +#endif /* CONFIG_NUMA */ + +static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, + struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + char string[64]; + int i; + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); + } + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); + } + return 0; +} + +static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, + unsigned int event) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + spin_lock(&mem->scanstat.lock); + memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); + memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); + spin_unlock(&mem->scanstat.lock); + return 0; +} + + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4231,6 +4832,18 @@ static struct cftype mem_cgroup_files[] = { .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .open = mem_control_numa_stat_open, + .mode = S_IRUGO, + }, +#endif + { + .name = "vmscan_stat", + .read_map = mem_cgroup_vmscan_stat_read, + .trigger = mem_cgroup_reset_vmscan_stat, + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4486,14 +5099,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->memsw, NULL); } mem->last_scanned_child = 0; - spin_lock_init(&mem->reclaim_param_lock); + mem->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&mem->oom_notify); if (parent) - mem->swappiness = get_swappiness(parent); + mem->swappiness = mem_cgroup_swappiness(parent); atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); + spin_lock_init(&mem->scanstat.lock); return &mem->css; free_out: __mem_cgroup_free(mem); @@ -4574,8 +5188,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, - PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4675,15 +5288,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - if (!mapping_cap_swap_backed(mapping)) { /* normal file */ - page = find_get_page(mapping, pgoff); - } else { /* shmem/tmpfs file. we should take account of swap too. */ - swp_entry_t ent; - mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + page = find_get_page(mapping, pgoff); + +#ifdef CONFIG_SWAP + /* shmem/tmpfs may report page out on swap: account for that too. */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) - entry->val = ent.val; + *entry = swap; + page = find_get_page(&swapper_space, swap.val); } - +#endif return page; } @@ -4737,7 +5352,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - VM_BUG_ON(pmd_trans_huge(*pmd)); + split_huge_page_pmd(walk->mm, pmd); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4844,8 +5460,7 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { int ret = 0; struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); @@ -4884,8 +5499,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { mem_cgroup_clear_mc(); } @@ -4899,8 +5513,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + split_huge_page_pmd(walk->mm, pmd); retry: - VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); @@ -4920,8 +5534,8 @@ retry: if (isolate_lru_page(page)) goto put; pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(pc, - mc.from, mc.to, false, PAGE_SIZE)) { + if (!mem_cgroup_move_account(page, 1, pc, + mc.from, mc.to, false)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -5003,41 +5617,35 @@ retry: static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { - struct mm_struct *mm; - - if (!mc.to) - /* no need to move charge */ - return; + struct mm_struct *mm = get_task_mm(p); - mm = get_task_mm(p); if (mm) { - mem_cgroup_move_charge(mm); + if (mc.to) + mem_cgroup_move_charge(mm); + put_swap_token(mm); mmput(mm); } - mem_cgroup_clear_mc(); + if (mc.to) + mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { return 0; } static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } #endif @@ -5060,19 +5668,12 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!(*s) || !strcmp(s, "=1")) + if (!strcmp(s, "1")) really_do_swap_account = 1; - else if (!strcmp(s, "=0")) + else if (!strcmp(s, "0")) really_do_swap_account = 0; return 1; } -__setup("swapaccount", enable_swap_account); +__setup("swapaccount=", enable_swap_account); -static int __init disable_swap_account(char *s) -{ - printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); - enable_swap_account("=0"); - return 1; -} -__setup("noswapaccount", disable_swap_account); #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f6f8b..2b43ba051ac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -52,6 +52,8 @@ #include <linux/swapops.h> #include <linux/hugetlb.h> #include <linux/memory_hotplug.h> +#include <linux/mm_inline.h> +#include <linux/kfifo.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -208,7 +210,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, * Don't use force here, it's convenient if the signal * can be temporarily blocked. * This could cause a loop when the user sets SIGBUS - * to SIG_IGN, but hopefully noone will do that? + * to SIG_IGN, but hopefully no one will do that? */ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ if (ret < 0) @@ -239,7 +241,11 @@ void shake_page(struct page *p, int access) if (access) { int nr; do { - nr = shrink_slab(1000, GFP_KERNEL, 1000); + struct shrink_control shrink = { + .gfp_mask = GFP_KERNEL, + }; + + nr = shrink_slab(&shrink, 1000, 1000); if (page_count(p) == 1) break; } while (nr > 10); @@ -386,10 +392,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ - goto out; + return; + + read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; @@ -403,9 +410,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, add_to_kill(tsk, page, vma, to_kill, tkc); } } - page_unlock_anon_vma(av); -out: read_unlock(&tasklist_lock); + page_unlock_anon_vma(av); } /* @@ -419,17 +425,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, struct prio_tree_iter iter; struct address_space *mapping = page->mapping; - /* - * A note on the locking order between the two locks. - * We don't rely on this particular order. - * If you have some other code that needs a different order - * feel free to switch them around. Or add a reverse link - * from mm_struct to task_struct, then this could be all - * done without taking tasklist_lock and looping over all tasks. - */ - + mutex_lock(&mapping->i_mmap_mutex); read_lock(&tasklist_lock); - spin_lock(&mapping->i_mmap_lock); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -449,8 +446,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, add_to_kill(tsk, page, vma, to_kill, tkc); } } - spin_unlock(&mapping->i_mmap_lock); read_unlock(&tasklist_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* @@ -634,7 +631,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * when the page is reread or dropped. If an * application assumes it will always get error on * fsync, but does other operations on the fd before - * and the page is dropped inbetween then the error + * and the page is dropped between then the error * will not be properly reported. * * This can already happen even without hwpoisoned @@ -728,7 +725,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) * The table matches them in order and calls the right handler. * * This is quite tricky because we can access page at any time - * in its live cycle, so all accesses have to be extremly careful. + * in its live cycle, so all accesses have to be extremely careful. * * This is not complete. More states could be added. * For any missing state don't attempt recovery. @@ -945,7 +942,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, collect_procs(ppage, &tokill); if (hpage != ppage) - lock_page_nosync(ppage); + lock_page(ppage); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) @@ -1038,7 +1035,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * Check "just unpoisoned", "filter hit", and * "race with other subpage." */ - lock_page_nosync(hpage); + lock_page(hpage); if (!PageHWPoison(hpage) || (hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { @@ -1088,7 +1085,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - lock_page_nosync(hpage); + lock_page(hpage); /* * unpoison always clear PG_hwpoison inside page lock @@ -1130,7 +1127,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) /* * Now take care of user space mappings. - * Abort on fail: __remove_from_page_cache() assumes unmapped page. + * Abort on fail: __delete_from_page_cache() assumes unmapped page. */ if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); @@ -1182,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno) __memory_failure(pfn, trapno, 0); } +#define MEMORY_FAILURE_FIFO_ORDER 4 +#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) + +struct memory_failure_entry { + unsigned long pfn; + int trapno; + int flags; +}; + +struct memory_failure_cpu { + DECLARE_KFIFO(fifo, struct memory_failure_entry, + MEMORY_FAILURE_FIFO_SIZE); + spinlock_t lock; + struct work_struct work; +}; + +static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); + +/** + * memory_failure_queue - Schedule handling memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: Flags for memory failure handling + * + * This function is called by the low level hardware error handler + * when it detects hardware memory corruption of a page. It schedules + * the recovering of error page, including dropping pages, killing + * processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Can run in IRQ context. + */ +void memory_failure_queue(unsigned long pfn, int trapno, int flags) +{ + struct memory_failure_cpu *mf_cpu; + unsigned long proc_flags; + struct memory_failure_entry entry = { + .pfn = pfn, + .trapno = trapno, + .flags = flags, + }; + + mf_cpu = &get_cpu_var(memory_failure_cpu); + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + if (kfifo_put(&mf_cpu->fifo, &entry)) + schedule_work_on(smp_processor_id(), &mf_cpu->work); + else + pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n", + pfn); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); +} +EXPORT_SYMBOL_GPL(memory_failure_queue); + +static void memory_failure_work_func(struct work_struct *work) +{ + struct memory_failure_cpu *mf_cpu; + struct memory_failure_entry entry = { 0, }; + unsigned long proc_flags; + int gotten; + + mf_cpu = &__get_cpu_var(memory_failure_cpu); + for (;;) { + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + gotten = kfifo_get(&mf_cpu->fifo, &entry); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + if (!gotten) + break; + __memory_failure(entry.pfn, entry.trapno, entry.flags); + } +} + +static int __init memory_failure_init(void) +{ + struct memory_failure_cpu *mf_cpu; + int cpu; + + for_each_possible_cpu(cpu) { + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + spin_lock_init(&mf_cpu->lock); + INIT_KFIFO(mf_cpu->fifo); + INIT_WORK(&mf_cpu->work, memory_failure_work_func); + } + + return 0; +} +core_initcall(memory_failure_init); + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1231,7 +1319,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - lock_page_nosync(page); + lock_page(page); /* * This test is racy because PG_hwpoison is set outside of page lock. * That's acceptable because that won't trigger kernel panic. Instead, @@ -1440,16 +1528,12 @@ int soft_offline_page(struct page *page, int flags) */ ret = invalidate_inode_page(page); unlock_page(page); - /* - * Drop count because page migration doesn't like raised - * counts. The page could get re-allocated, but if it becomes - * LRU the isolation will just fail. * RED-PEN would be better to keep it isolated here, but we * would need to fix isolation locking first. */ - put_page(page); if (ret == 1) { + put_page(page); ret = 0; pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; @@ -1461,9 +1545,15 @@ int soft_offline_page(struct page *page, int flags) * handles a large number of cases for us. */ ret = isolate_lru_page(page); + /* + * Drop page reference which is came from get_any_page() + * successful isolate_lru_page() already took another one. + */ + put_page(page); if (!ret) { LIST_HEAD(pagelist); - + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); @@ -1487,35 +1577,3 @@ done: /* keep elevated page count for bad page */ return ret; } - -/* - * The caller must hold current->mm->mmap_sem in read mode. - */ -int is_hwpoison_address(unsigned long addr) -{ - pgd_t *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t pte, *ptep; - swp_entry_t entry; - - pgdp = pgd_offset(current->mm, addr); - if (!pgd_present(*pgdp)) - return 0; - pudp = pud_offset(pgdp, addr); - pud = *pudp; - if (!pud_present(pud) || pud_large(pud)) - return 0; - pmdp = pmd_offset(pudp, addr); - pmd = *pmdp; - if (!pmd_present(pmd) || pmd_large(pmd)) - return 0; - ptep = pte_offset_map(pmdp, addr); - pte = *ptep; - pte_unmap(ptep); - if (!is_swap_pte(pte)) - return 0; - entry = pte_to_swp_entry(pte); - return is_hwpoison_entry(entry); -} -EXPORT_SYMBOL_GPL(is_hwpoison_address); diff --git a/mm/memory.c b/mm/memory.c index 5823698c2b7..a56e3ba816b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) { __sync_task_rss_stat(task, mm); } -#else +#else /* SPLIT_RSS_COUNTING */ #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) @@ -191,7 +191,205 @@ static void check_sync_rss_stat(struct task_struct *task) { } +#endif /* SPLIT_RSS_COUNTING */ + +#ifdef HAVE_GENERIC_MMU_GATHER + +static int tlb_next_batch(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + batch = tlb->active; + if (batch->next) { + tlb->active = batch->next; + return 1; + } + + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); + if (!batch) + return 0; + + batch->next = NULL; + batch->nr = 0; + batch->max = MAX_GATHER_BATCH; + + tlb->active->next = batch; + tlb->active = batch; + + return 1; +} + +/* tlb_gather_mmu + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @fullmm argument is used when @mm is without + * users and we're going to destroy the full address space (exit/execve). + */ +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) +{ + tlb->mm = mm; + + tlb->fullmm = fullmm; + tlb->need_flush = 0; + tlb->fast_mode = (num_possible_cpus() == 1); + tlb->local.next = NULL; + tlb->local.nr = 0; + tlb->local.max = ARRAY_SIZE(tlb->__pages); + tlb->active = &tlb->local; + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb->batch = NULL; #endif +} + +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + if (!tlb->need_flush) + return; + tlb->need_flush = 0; + tlb_flush(tlb); +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb_table_flush(tlb); +#endif + + if (tlb_fast_mode(tlb)) + return; + + for (batch = &tlb->local; batch; batch = batch->next) { + free_pages_and_swap_cache(batch->pages, batch->nr); + batch->nr = 0; + } + tlb->active = &tlb->local; +} + +/* tlb_finish_mmu + * Called at the end of the shootdown operation to free up any resources + * that were required. + */ +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + struct mmu_gather_batch *batch, *next; + + tlb_flush_mmu(tlb); + + /* keep the page table cache within bounds */ + check_pgt_cache(); + + for (batch = tlb->local.next; batch; batch = next) { + next = batch->next; + free_pages((unsigned long)batch, 0); + } + tlb->local.next = NULL; +} + +/* __tlb_remove_page + * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while + * handling the additional races in SMP caused by other CPUs caching valid + * mappings in their TLBs. Returns the number of free page slots left. + * When out of page slots we must call tlb_flush_mmu(). + */ +int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + struct mmu_gather_batch *batch; + + tlb->need_flush = 1; + + if (tlb_fast_mode(tlb)) { + free_page_and_swap_cache(page); + return 1; /* avoid calling tlb_flush_mmu() */ + } + + batch = tlb->active; + batch->pages[batch->nr++] = page; + if (batch->nr == batch->max) { + if (!tlb_next_batch(tlb)) + return 0; + batch = tlb->active; + } + VM_BUG_ON(batch->nr > batch->max); + + return batch->max - batch->nr; +} + +#endif /* HAVE_GENERIC_MMU_GATHER */ + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + +/* + * See the comment near struct mmu_table_batch. + */ + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely on + * IRQ disabling. See the comment near struct mmu_table_batch. + */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + __tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + struct mmu_table_batch *batch; + int i; + + batch = container_of(head, struct mmu_table_batch, rcu); + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + tlb->need_flush = 1; + + /* + * When there's less then two users of this mm there cannot be a + * concurrent page-table walk. + */ + if (atomic_read(&tlb->mm->mm_users) < 2) { + __tlb_remove_table(table); + return; + } + + if (*batch == NULL) { + *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_table_flush(tlb); +} + +#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ /* * If a p?d_bad entry is found while walking page tables, report @@ -533,7 +731,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(unsigned int flags) +static inline int is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } @@ -909,26 +1107,26 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { struct mm_struct *mm = tlb->mm; - pte_t *pte; - spinlock_t *ptl; + int force_flush = 0; int rss[NR_MM_COUNTERS]; + spinlock_t *ptl; + pte_t *start_pte; + pte_t *pte; +again: init_rss_vec(rss); - - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = start_pte; arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { - (*zap_work)--; continue; } - (*zap_work) -= PAGE_SIZE; - if (pte_present(ptent)) { struct page *page; @@ -974,7 +1172,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - tlb_remove_page(tlb, page); + force_flush = !__tlb_remove_page(tlb, page); + if (force_flush) + break; continue; } /* @@ -995,11 +1195,23 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, print_bad_pte(vma, addr, ptent, NULL); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap_unlock(start_pte, ptl); + + /* + * mmu_gather ran out of room to batch pages, we break out of + * the PTE lock to avoid doing the potential expensive TLB invalidate + * and page-free while holding it. + */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu(tlb); + if (addr != end) + goto again; + } return addr; } @@ -1007,7 +1219,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { pmd_t *pmd; unsigned long next; @@ -1019,19 +1231,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - } else if (zap_huge_pmd(tlb, vma, pmd)) { - (*zap_work)--; + } else if (zap_huge_pmd(tlb, vma, pmd)) continue; - } /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) { - (*zap_work)--; + if (pmd_none_or_clear_bad(pmd)) continue; - } - next = zap_pte_range(tlb, vma, pmd, addr, next, - zap_work, details); - } while (pmd++, addr = next, (addr != end && *zap_work > 0)); + next = zap_pte_range(tlb, vma, pmd, addr, next, details); + cond_resched(); + } while (pmd++, addr = next, addr != end); return addr; } @@ -1039,7 +1247,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { pud_t *pud; unsigned long next; @@ -1047,13 +1255,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) { - (*zap_work)--; + if (pud_none_or_clear_bad(pud)) continue; - } - next = zap_pmd_range(tlb, vma, pud, addr, next, - zap_work, details); - } while (pud++, addr = next, (addr != end && *zap_work > 0)); + next = zap_pmd_range(tlb, vma, pud, addr, next, details); + } while (pud++, addr = next, addr != end); return addr; } @@ -1061,7 +1266,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, static unsigned long unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, - long *zap_work, struct zap_details *details) + struct zap_details *details) { pgd_t *pgd; unsigned long next; @@ -1075,29 +1280,19 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) { - (*zap_work)--; + if (pgd_none_or_clear_bad(pgd)) continue; - } - next = zap_pud_range(tlb, vma, pgd, addr, next, - zap_work, details); - } while (pgd++, addr = next, (addr != end && *zap_work > 0)); + next = zap_pud_range(tlb, vma, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); mem_cgroup_uncharge_end(); return addr; } -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - /** * unmap_vmas - unmap a range of memory covered by a list of vma's - * @tlbp: address of the caller's struct mmu_gather + * @tlb: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -1108,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * * Unmap all pages in the vma list. * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. - * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. @@ -1121,17 +1312,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, +unsigned long unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { - long zap_work = ZAP_BLOCK_SIZE; - unsigned long tlb_start = 0; /* For tlb_finish_mmu */ - int tlb_start_valid = 0; unsigned long start = start_addr; - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = (*tlbp)->fullmm; struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); @@ -1152,11 +1338,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, untrack_pfn_vma(vma, 0, 0); while (start != end) { - if (!tlb_start_valid) { - tlb_start = start; - tlb_start_valid = 1; - } - if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it @@ -1169,39 +1350,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, * Since no pte has actually been setup, it is * safe to do nothing in this case. */ - if (vma->vm_file) { + if (vma->vm_file) unmap_hugepage_range(vma, start, end, NULL); - zap_work -= (end - start) / - pages_per_huge_page(hstate_vma(vma)); - } start = end; } else - start = unmap_page_range(*tlbp, vma, - start, end, &zap_work, details); - - if (zap_work > 0) { - BUG_ON(start != end); - break; - } - - tlb_finish_mmu(*tlbp, tlb_start, start); - - if (need_resched() || - (i_mmap_lock && spin_needbreak(i_mmap_lock))) { - if (i_mmap_lock) { - *tlbp = NULL; - goto out; - } - cond_resched(); - } - - *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); - tlb_start_valid = 0; - zap_work = ZAP_BLOCK_SIZE; + start = unmap_page_range(tlb, vma, start, end, details); } } -out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ } @@ -1217,16 +1374,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; + struct mmu_gather tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); - if (tlb) - tlb_finish_mmu(tlb, address, end); + tlb_finish_mmu(&tlb, address, end); return end; } @@ -1359,7 +1515,7 @@ split_fallthrough: */ mark_page_accessed(page); } - if (flags & FOLL_MLOCK) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -1410,6 +1566,61 @@ no_page_table: return page; } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return stack_guard_page_start(vma, addr) || + stack_guard_page_end(vma, addr+PAGE_SIZE); +} + +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, @@ -1437,9 +1648,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { + if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -1464,10 +1674,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pte_unmap(pte); return i ? : -EFAULT; } + vma = get_gate_vma(mm); if (pages) { struct page *page; - page = vm_normal_page(gate_vma, start, *pte); + page = vm_normal_page(vma, start, *pte); if (!page) { if (!(gup_flags & FOLL_DUMP) && is_zero_pfn(pte_pfn(*pte))) @@ -1481,12 +1692,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, get_page(page); } pte_unmap(pte); - if (vmas) - vmas[i] = gate_vma; - i++; - start += PAGE_SIZE; - nr_pages--; - continue; + goto next_page; } if (!vma || @@ -1516,10 +1722,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int ret; unsigned int fault_flags = 0; + /* For mlock, just skip the stack guard page. */ + if (foll_flags & FOLL_MLOCK) { + if (stack_guard_page(vma, start)) + goto next_page; + } if (foll_flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (foll_flags & FOLL_NOWAIT) + fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); ret = handle_mm_fault(mm, vma, start, fault_flags); @@ -1527,19 +1740,30 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| - VM_FAULT_SIGBUS)) + if (ret & (VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (i) + return i; + else if (gup_flags & FOLL_HWPOISON) + return -EHWPOISON; + else + return -EFAULT; + } + if (ret & VM_FAULT_SIGBUS) return i ? i : -EFAULT; BUG(); } - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } if (ret & VM_FAULT_RETRY) { - *nonblocking = 0; + if (nonblocking) + *nonblocking = 0; return i; } @@ -1569,6 +1793,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, flush_anon_page(vma, page, start); flush_dcache_page(page); } +next_page: if (vmas) vmas[i] = vma; i++; @@ -1578,10 +1803,68 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (nr_pages); return i; } +EXPORT_SYMBOL(__get_user_pages); -/** +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* * get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -2115,10 +2398,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range); * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on * those architectures or configurations (e.g. i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_file_page + * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault * must check under lock before unmapping the pte and proceeding * (but do_wp_page is only called after already making such a check; - * and do_anonymous_page and do_no_page can safely check later on). + * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) @@ -2314,7 +2597,7 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * do_no_page is protected similarly. + * __do_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); @@ -2464,96 +2747,11 @@ unwritable_page: return ret; } -/* - * Helper functions for unmap_mapping_range(). - * - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ - * - * We have to restart searching the prio_tree whenever we drop the lock, - * since the iterator is only valid while the lock is held, and anyway - * a later vma might be split and reinserted earlier while lock dropped. - * - * The list of nonlinear vmas could be handled more efficiently, using - * a placeholder, but handle it in the same way until a need is shown. - * It is important to search the prio_tree before nonlinear list: a vma - * may become nonlinear and be shifted from prio_tree to nonlinear list - * while the lock is dropped; but never shifted from list to prio_tree. - * - * In order to make forward progress despite restarting the search, - * vm_truncate_count is used to mark a vma as now dealt with, so we can - * quickly skip it next time around. Since the prio_tree search only - * shows us those vmas affected by unmapping the range in question, we - * can't efficiently keep all vmas in step with mapping->truncate_count: - * so instead reset them all whenever it wraps back to 0 (then go to 1). - * mapping->truncate_count and vma->vm_truncate_count are protected by - * i_mmap_lock. - * - * In order to make forward progress despite repeatedly restarting some - * large vma, note the restart_addr from unmap_vmas when it breaks out: - * and restart from that address when we reach that vma again. It might - * have been split or merged, shrunk or extended, but never shifted: so - * restart_addr remains valid so long as it remains in the vma's range. - * unmap_mapping_range forces truncate_count to leap over page-aligned - * values so we can save vma's restart_addr in its truncate_count field. - */ -#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) - -static void reset_vma_truncate_counts(struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - vma->vm_truncate_count = 0; - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - vma->vm_truncate_count = 0; -} - -static int unmap_mapping_range_vma(struct vm_area_struct *vma, +static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { - unsigned long restart_addr; - int need_break; - - /* - * files that support invalidating or truncating portions of the - * file from under mmaped areas must have their ->fault function - * return a locked page (and set VM_FAULT_LOCKED in the return). - * This provides synchronisation against concurrent unmapping here. - */ - -again: - restart_addr = vma->vm_truncate_count; - if (is_restart_addr(restart_addr) && start_addr < restart_addr) { - start_addr = restart_addr; - if (start_addr >= end_addr) { - /* Top of vma has been split off since last time */ - vma->vm_truncate_count = details->truncate_count; - return 0; - } - } - - restart_addr = zap_page_range(vma, start_addr, - end_addr - start_addr, details); - need_break = need_resched() || spin_needbreak(details->i_mmap_lock); - - if (restart_addr >= end_addr) { - /* We have now completed this vma: mark it so */ - vma->vm_truncate_count = details->truncate_count; - if (!need_break) - return 0; - } else { - /* Note restart_addr in vma's truncate_count field */ - vma->vm_truncate_count = restart_addr; - if (!need_break) - goto again; - } - - spin_unlock(details->i_mmap_lock); - cond_resched(); - spin_lock(details->i_mmap_lock); - return -EINTR; + zap_page_range(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct prio_tree_root *root, @@ -2563,12 +2761,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root, struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; -restart: vma_prio_tree_foreach(vma, &iter, root, details->first_index, details->last_index) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; vba = vma->vm_pgoff; vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; @@ -2580,11 +2774,10 @@ restart: if (zea > vea) zea = vea; - if (unmap_mapping_range_vma(vma, + unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, - details) < 0) - goto restart; + details); } } @@ -2599,15 +2792,9 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. */ -restart: list_for_each_entry(vma, head, shared.vm_set.list) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; details->nonlinear_vma = vma; - if (unmap_mapping_range_vma(vma, vma->vm_start, - vma->vm_end, details) < 0) - goto restart; + unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } } @@ -2646,53 +2833,17 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - details.i_mmap_lock = &mapping->i_mmap_lock; - mutex_lock(&mapping->unmap_mutex); - spin_lock(&mapping->i_mmap_lock); - - /* Protect against endless unmapping loops */ - mapping->truncate_count++; - if (unlikely(is_restart_addr(mapping->truncate_count))) { - if (mapping->truncate_count == 0) - reset_vma_truncate_counts(mapping); - mapping->truncate_count++; - } - details.truncate_count = mapping->truncate_count; + mutex_lock(&mapping->i_mmap_mutex); if (unlikely(!prio_tree_empty(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - spin_unlock(&mapping->i_mmap_lock); - mutex_unlock(&mapping->unmap_mutex); + mutex_unlock(&mapping->i_mmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) -{ - struct address_space *mapping = inode->i_mapping; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); - unmap_mapping_range(mapping, offset, (end - offset), 1); - truncate_inode_pages_range(mapping, offset, end); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); - up_write(&inode->i_alloc_sem); - mutex_unlock(&inode->i_mutex); - - return 0; -} - /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -2707,7 +2858,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; @@ -2747,6 +2898,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -2895,7 +3047,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo if (prev && prev->vm_end == address) return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; - expand_stack(vma, address - PAGE_SIZE); + expand_downwards(vma, address - PAGE_SIZE); } if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { struct vm_area_struct *next = vma->vm_next; @@ -2997,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *page_table; spinlock_t *ptl; struct page *page; + struct page *cow_page; pte_t entry; int anon = 0; - int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; int page_mkwrite = 0; + /* + * If we do COW later, allocate page befor taking lock_page() + * on the file cache page. This will reduce lock holding time. + */ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!cow_page) + return VM_FAULT_OOM; + + if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { + page_cache_release(cow_page); + return VM_FAULT_OOM; + } + } else + cow_page = NULL; + vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; @@ -3013,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; + goto uncharge_out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - return VM_FAULT_HWPOISON; + ret = VM_FAULT_HWPOISON; + goto uncharge_out; } /* @@ -3036,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { + page = cow_page; anon = 1; - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto out; - } - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - page_cache_release(page); - goto out; - } - charged = 1; copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -3121,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, page_table); } else { - if (charged) - mem_cgroup_uncharge_page(page); + if (cow_page) + mem_cgroup_uncharge_page(cow_page); if (anon) page_cache_release(page); else @@ -3131,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); -out: if (dirty_page) { struct address_space *mapping = page->mapping; @@ -3161,6 +3318,13 @@ out: unwritable_page: page_cache_release(page); return ret; +uncharge_out: + /* fs's fault handler get error */ + if (cow_page) { + mem_cgroup_uncharge_page(cow_page); + page_cache_release(cow_page); + } + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, @@ -3286,6 +3450,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); @@ -3322,7 +3487,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) @@ -3439,7 +3604,7 @@ static int __init gate_vma_init(void) __initcall(gate_vma_init); #endif -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef AT_SYSINFO_EHDR return &gate_vma; @@ -3448,7 +3613,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) #endif } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) @@ -3589,20 +3754,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, #endif /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space as given in mm. If non-NULL, use the + * given task for page fault accounting. */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { - struct mm_struct *mm; struct vm_area_struct *vma; void *old_buf = buf; - mm = get_task_mm(tsk); - if (!mm) - return 0; - down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ while (len) { @@ -3619,7 +3779,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in */ #ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); - if (!vma) + if (!vma || vma->vm_start > addr) break; if (vma->vm_ops && vma->vm_ops->access) ret = vma->vm_ops->access(vma, addr, buf, @@ -3651,11 +3811,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in addr += bytes; } up_read(&mm->mmap_sem); - mmput(mm); return buf - old_buf; } +/** + * access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, int write) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); + + return ret; +} + /* * Print the name of a VMA. */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 321fc7455df..6e7d8b21dbf 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,17 @@ #include "internal.h" +/* + * online_page_callback contains pointer to current page onlining function. + * Initially it is generic_online_page(). If it is required it could be + * changed by calling set_online_page_callback() for callback registration + * and restore_online_page_callback() for generic callback restore. + */ + +static void generic_online_page(struct page *page); + +static online_page_callback_t online_page_callback = generic_online_page; + DEFINE_MUTEX(mem_hotplug_mutex); void lock_memory_hotplug(void) @@ -361,27 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__remove_pages); -void online_page(struct page *page) +int set_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == generic_online_page) { + online_page_callback = callback; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(set_online_page_callback); + +int restore_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == callback) { + online_page_callback = generic_online_page; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(restore_online_page_callback); + +void __online_page_set_limits(struct page *page) { unsigned long pfn = page_to_pfn(page); - totalram_pages++; if (pfn >= num_physpages) num_physpages = pfn + 1; +} +EXPORT_SYMBOL_GPL(__online_page_set_limits); + +void __online_page_increment_counters(struct page *page) +{ + totalram_pages++; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) totalhigh_pages++; #endif +} +EXPORT_SYMBOL_GPL(__online_page_increment_counters); -#ifdef CONFIG_FLATMEM - max_mapnr = max(page_to_pfn(page), max_mapnr); -#endif - +void __online_page_free(struct page *page) +{ ClearPageReserved(page); init_page_count(page); __free_page(page); } +EXPORT_SYMBOL_GPL(__online_page_free); + +static void generic_online_page(struct page *page) +{ + __online_page_set_limits(page); + __online_page_increment_counters(page); + __online_page_free(page); +} static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) @@ -392,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); - online_page(page); + (*online_page_callback)(page); onlined_pages++; } *(unsigned long *)arg = onlined_pages; @@ -400,7 +458,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, } -int online_pages(unsigned long pfn, unsigned long nr_pages) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages) { unsigned long onlined_pages = 0; struct zone *zone; @@ -459,8 +517,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone_pcp_update(zone); mutex_unlock(&zonelists_mutex); - setup_per_zone_wmarks(); - calculate_zone_inactive_ratio(zone); + + init_per_zone_wmark_min(); + if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); @@ -497,6 +556,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_node(nid, zones_size, start_pfn, zholes_size); + /* + * The node we allocated has no zone fallback lists. For avoiding + * to access not-initialized zonelist, build here. + */ + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + return pgdat; } @@ -518,7 +585,7 @@ int mem_online_node(int nid) lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); - if (pgdat) { + if (!pgdat) { ret = -ENOMEM; goto out; } @@ -705,7 +772,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); - if (!page_count(page)) + if (!get_page_unless_zero(page)) continue; /* * We can skip free pages. And we can only deal with pages on @@ -713,6 +780,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) */ ret = isolate_lru_page(page); if (!ret) { /* Success */ + put_page(page); list_add_tail(&page->lru, &source); move_pages--; inc_zone_page_state(page, NR_ISOLATED_ANON + @@ -724,7 +792,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn); dump_page(page); #endif - /* Becasue we don't have big zone->lock. we should + put_page(page); + /* Because we don't have big zone->lock. we should check this again here. */ if (page_count(page)) { not_managed++; @@ -795,7 +864,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -static int offline_pages(unsigned long start_pfn, +static int __ref offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -893,8 +962,8 @@ repeat: zone->zone_pgdat->node_present_pages -= offlined_pages; totalram_pages -= offlined_pages; - setup_per_zone_wmarks(); - calculate_zone_inactive_ratio(zone); + init_per_zone_wmark_min(); + if (!node_present_pages(node)) { node_clear_state(node, N_HIGH_MEMORY); kswapd_stop(node); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 49355a970be..8b57173c1dd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,13 +93,13 @@ #include <asm/tlbflush.h> #include <asm/uaccess.h> +#include <linux/random.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ -#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -457,7 +457,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static void gather_stats(struct page *, void *, int pte_dirty); static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); @@ -492,9 +491,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) continue; - if (flags & MPOL_MF_STATS) - gather_stats(page, private, pte_dirty(*pte)); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(page, private, flags); else break; @@ -993,7 +990,7 @@ int do_migrate_pages(struct mm_struct *mm, * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. - * Otherwise when we finish scannng from_tmp, we at least have the + * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. @@ -1489,7 +1486,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct task_struct *task, +struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; @@ -1524,10 +1521,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1650,6 +1646,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } +/* + * Return the bit number of a random bit set in the nodemask. + * (returns -1 if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = -1; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} + #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) @@ -1679,7 +1690,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1820,7 +1831,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1836,7 +1847,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy @@ -1892,7 +1903,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } @@ -1979,8 +1991,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - return a->v.preferred_node == b->v.preferred_node && - a->flags == b->flags; + return a->v.preferred_node == b->v.preferred_node; default: BUG(); return 0; @@ -2530,159 +2541,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) } return p - buffer; } - -struct numa_maps { - unsigned long pages; - unsigned long anon; - unsigned long active; - unsigned long writeback; - unsigned long mapcount_max; - unsigned long dirty; - unsigned long swapcache; - unsigned long node[MAX_NUMNODES]; -}; - -static void gather_stats(struct page *page, void *private, int pte_dirty) -{ - struct numa_maps *md = private; - int count = page_mapcount(page); - - md->pages++; - if (pte_dirty || PageDirty(page)) - md->dirty++; - - if (PageSwapCache(page)) - md->swapcache++; - - if (PageActive(page) || PageUnevictable(page)) - md->active++; - - if (PageWriteback(page)) - md->writeback++; - - if (PageAnon(page)) - md->anon++; - - if (count > md->mapcount_max) - md->mapcount_max = count; - - md->node[page_to_nid(page)]++; -} - -#ifdef CONFIG_HUGETLB_PAGE -static void check_huge_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct numa_maps *md) -{ - unsigned long addr; - struct page *page; - struct hstate *h = hstate_vma(vma); - unsigned long sz = huge_page_size(h); - - for (addr = start; addr < end; addr += sz) { - pte_t *ptep = huge_pte_offset(vma->vm_mm, - addr & huge_page_mask(h)); - pte_t pte; - - if (!ptep) - continue; - - pte = *ptep; - if (pte_none(pte)) - continue; - - page = pte_page(pte); - if (!page) - continue; - - gather_stats(page, md, pte_dirty(*ptep)); - } -} -#else -static inline void check_huge_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct numa_maps *md) -{ -} -#endif - -/* - * Display pages allocated per node and memory policy via /proc. - */ -int show_numa_map(struct seq_file *m, void *v) -{ - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct numa_maps *md; - struct file *file = vma->vm_file; - struct mm_struct *mm = vma->vm_mm; - struct mempolicy *pol; - int n; - char buffer[50]; - - if (!mm) - return 0; - - md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); - if (!md) - return 0; - - pol = get_vma_policy(priv->task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol, 0); - mpol_cond_put(pol); - - seq_printf(m, "%08lx %s", vma->vm_start, buffer); - - if (file) { - seq_printf(m, " file="); - seq_path(m, &file->f_path, "\n\t= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { - seq_printf(m, " heap"); - } else if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - seq_printf(m, " stack"); - } - - if (is_vm_hugetlb_page(vma)) { - check_huge_range(vma, vma->vm_start, vma->vm_end, md); - seq_printf(m, " huge"); - } else { - check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); - } - - if (!md->pages) - goto out; - - if (md->anon) - seq_printf(m," anon=%lu",md->anon); - - if (md->dirty) - seq_printf(m," dirty=%lu",md->dirty); - - if (md->pages != md->anon && md->pages != md->dirty) - seq_printf(m, " mapped=%lu", md->pages); - - if (md->mapcount_max > 1) - seq_printf(m, " mapmax=%lu", md->mapcount_max); - - if (md->swapcache) - seq_printf(m," swapcache=%lu", md->swapcache); - - if (md->active < md->pages && !is_vm_hugetlb_page(vma)) - seq_printf(m," active=%lu", md->active); - - if (md->writeback) - seq_printf(m," writeback=%lu", md->writeback); - - for_each_node_state(n, N_HIGH_MEMORY) - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); -out: - seq_putc(m, '\n'); - kfree(md); - - if (m->count < m->size) - m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; - return 0; -} diff --git a/mm/migrate.c b/mm/migrate.c index 352de555626..666e4e67741 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - if (PageSwapBacked(page)) { + if (!PageSwapCache(page) && PageSwapBacked(page)) { __dec_zone_page_state(page, NR_SHMEM); __inc_zone_page_state(newpage, NR_SHMEM); } @@ -375,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) * redo the accounting that clear_page_dirty_for_io undid, * but we can't use set_page_dirty because that function * is actually a signal that all of the page has become dirty. - * Wheras only part of our page may be dirty. + * Whereas only part of our page may be dirty. */ __set_page_dirty_nobuffers(newpage); } @@ -564,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache) + int remap_swapcache, bool sync) { struct address_space *mapping; int rc; @@ -586,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page); - else if (mapping->a_ops->migratepage) + else { /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. + * Do not writeback pages if !sync and migratepage is + * not pointing to migrate_page() which is nonblocking + * (swapcache/tmpfs uses migratepage = migrate_page). */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); + if (PageDirty(page) && !sync && + mapping->a_ops->migratepage != migrate_page) + rc = -EBUSY; + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + } if (rc) { newpage->mapping = NULL; @@ -623,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; int charge = 0; - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; if (!newpage) @@ -641,7 +651,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, rc = -EAGAIN; if (!trylock_page(page)) { - if (!force) + if (!force || !sync) goto move_newpage; /* @@ -678,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* charge against new page */ - charge = mem_cgroup_prepare_migration(page, newpage, &mem); + charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); if (charge == -ENOMEM) { rc = -ENOMEM; goto unlock; @@ -686,7 +696,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force || !sync) + /* + * For !sync, there is no point retrying as the retry loop + * is expected to be too short for PageWriteback to be cleared + */ + if (!sync) { + rc = -EBUSY; + goto uncharge; + } + if (!force) goto uncharge; wait_on_page_writeback(page); } @@ -703,15 +721,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * Only page_lock_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. */ - anon_vma = page_lock_anon_vma(page); + anon_vma = page_get_anon_vma(page); if (anon_vma) { /* - * Take a reference count on the anon_vma if the - * page is mapped so that it is guaranteed to - * exist when the page is remapped later + * Anon page */ - get_anon_vma(anon_vma); - page_unlock_anon_vma(anon_vma); } else if (PageSwapCache(page)) { /* * We cannot be sure that the anon_vma of an unmapped @@ -757,14 +771,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache); + rc = move_to_new_page(newpage, page, remap_swapcache, sync); if (rc && remap_swapcache) remove_migration_ptes(page, page); /* Drop an anon_vma reference if we took one */ if (anon_vma) - drop_anon_vma(anon_vma); + put_anon_vma(anon_vma); uncharge: if (!charge) @@ -839,24 +853,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, lock_page(hpage); } - if (PageAnon(hpage)) { - anon_vma = page_lock_anon_vma(hpage); - if (anon_vma) { - get_anon_vma(anon_vma); - page_unlock_anon_vma(anon_vma); - } - } + if (PageAnon(hpage)) + anon_vma = page_get_anon_vma(hpage); try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1); + rc = move_to_new_page(new_hpage, hpage, 1, sync); if (rc) remove_migration_ptes(hpage, hpage); if (anon_vma) - drop_anon_vma(anon_vma); + put_anon_vma(anon_vma); out: unlock_page(hpage); diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c7..636a86876ff 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * file will not get a swp_entry_t in its pte, but rather it is like * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. - * - * However when tmpfs moves the page from pagecache and into swapcache, - * it is still in core, but the find_get_page below won't find it. - * No big deal, but make a note of it. */ page = find_get_page(mapping, pgoff); +#ifdef CONFIG_SWAP + /* shmem/tmpfs may return swap: account for swapcache page too. */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swap = radix_to_swp_entry(page); + page = find_get_page(&swapper_space, swap.val); + } +#endif if (page) { present = PageUptodate(page); page_cache_release(page); diff --git a/mm/mlock.c b/mm/mlock.c index c3924c7f00b..048260c4e02 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page) } } -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return (vma->vm_flags & VM_GROWSDOWN) && - (vma->vm_start == addr) && - !vma_stack_continue(vma->vm_prev, addr); -} - /** * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma @@ -169,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -185,15 +178,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE; - if (vma->vm_flags & VM_LOCKED) - gup_flags |= FOLL_MLOCK; - - /* We don't try to access the guard page of a stack vma */ - if (stack_guard_page(vma, start)) { - addr += PAGE_SIZE; - nr_pages--; - } - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, NULL, NULL, nonblocking); } @@ -237,7 +221,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current))) { + vma == get_gate_vma(current->mm))) { __mlock_vma_pages_range(vma, start, end, NULL); @@ -323,16 +307,16 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, unsigned int newflags) + unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = newflags & VM_LOCKED; + int lock = !!(newflags & VM_LOCKED); if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) goto out; /* don't set VM_LOCKED, don't count */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -401,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - unsigned int newflags; + vm_flags_t newflags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ @@ -540,7 +524,7 @@ static int do_mlockall(int flags) goto out; for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { - unsigned int newflags; + vm_flags_t newflags; newflags = vma->vm_flags | VM_LOCKED; if (!(flags & MCL_CURRENT)) diff --git a/mm/mmap.c b/mm/mmap.c index 2ec8eb5a9cd..a65efd4db3e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -84,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) } EXPORT_SYMBOL(vm_get_page_prot); -int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio = 50; /* default is 50% */ +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ +int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -struct percpu_counter vm_committed_as; +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. + */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* * Check that a process has enough memory to allocate a new virtual @@ -118,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -132,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_page_state(NR_SLAB_RECLAIMABLE); /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - - /* * Leave reserved pages. The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; @@ -190,7 +186,7 @@ error: } /* - * Requires inode->i_mapping->i_mmap_lock + * Requires inode->i_mapping->i_mmap_mutex */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) @@ -218,9 +214,9 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); __remove_shared_vm_struct(vma, file, mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } } @@ -259,7 +255,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ - if (mm->start_brk > PAGE_ALIGN(mm->end_data)) + if (current->brk_randomized) min_brk = mm->start_brk; else min_brk = mm->end_data; @@ -394,29 +390,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, return vma; } -static inline void -__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent) -{ - struct vm_area_struct *next; - - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - mm->mmap = vma; - if (rb_parent) - next = rb_entry(rb_parent, - struct vm_area_struct, vm_rb); - else - next = NULL; - } - vma->vm_next = next; - if (next) - next->vm_prev = vma; -} - void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, struct rb_node **rb_link, struct rb_node *rb_parent) { @@ -464,16 +437,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file) mapping = vma->vm_file->f_mapping; - if (mapping) { - spin_lock(&mapping->i_mmap_lock); - vma->vm_truncate_count = mapping->truncate_count; - } + if (mapping) + mutex_lock(&mapping->i_mmap_mutex); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); mm->map_count++; validate_mm(mm); @@ -576,17 +547,8 @@ again: remove_next = 1 + (end > next->vm_end); mapping = file->f_mapping; if (!(vma->vm_flags & VM_NONLINEAR)) root = &mapping->i_mmap; - spin_lock(&mapping->i_mmap_lock); - if (importer && - vma->vm_truncate_count != next->vm_truncate_count) { - /* - * unmap_mapping_range might be in progress: - * ensure that the expanding vma is rescanned. - */ - importer->vm_truncate_count = 0; - } + mutex_lock(&mapping->i_mmap_mutex); if (insert) { - insert->vm_truncate_count = vma->vm_truncate_count; /* * Put into prio_tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page @@ -605,7 +567,7 @@ again: remove_next = 1 + (end > next->vm_end); * lock may be shared between many sibling processes. Skipping * the lock for brk adjustments makes a difference sometimes. */ - if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { + if (vma->anon_vma && (importer || start != vma->vm_start)) { anon_vma = vma->anon_vma; anon_vma_lock(anon_vma); } @@ -652,7 +614,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) anon_vma_unlock(anon_vma); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (remove_next) { if (file) { @@ -699,9 +661,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, } static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2) + struct anon_vma *anon_vma2, + struct vm_area_struct *vma) { - return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + if ((!anon_vma1 || !anon_vma2) && (!vma || + list_is_singular(&vma->anon_vma_chain))) + return 1; + return anon_vma1 == anon_vma2; } /* @@ -720,7 +690,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) { if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; } @@ -739,7 +709,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) { if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; if (vma->vm_pgoff + vm_pglen == vm_pgoff) @@ -817,7 +787,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen) && is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma)) { + next->anon_vma, NULL)) { /* cases 1, 6 */ err = vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL); @@ -928,14 +898,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) if (anon_vma) return anon_vma; try_prev: - /* - * It is potentially slow to have to call find_vma_prev here. - * But it's only on the first write fault on the vma, not - * every time, and we could devise a way to avoid it later - * (e.g. stash info in next's anon_vma_node when assigning - * an anon_vma, or when trying vma_merge). Another time. - */ - BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); + near = vma->vm_prev; if (!near) goto none; @@ -982,7 +945,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, { struct mm_struct * mm = current->mm; struct inode *inode; - unsigned int vm_flags; + vm_flags_t vm_flags; int error; unsigned long reqprot = prot; @@ -1187,7 +1150,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) */ int vma_wants_writenotify(struct vm_area_struct *vma) { - unsigned int vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; /* If it was private or non-writable, the write bit is already clear */ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) @@ -1215,7 +1178,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(struct file *file, unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM @@ -1229,7 +1192,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags) unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff) + vm_flags_t vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1767,10 +1730,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; - error = acct_stack_growth(vma, size, grow); - if (!error) { - vma->vm_end = address; - perf_event_mmap(vma); + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_end = address; + perf_event_mmap(vma); + } } } vma_unlock_anon_vma(vma); @@ -1782,7 +1748,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -static int expand_downwards(struct vm_area_struct *vma, +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1814,11 +1780,14 @@ static int expand_downwards(struct vm_area_struct *vma, size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; - error = acct_stack_growth(vma, size, grow); - if (!error) { - vma->vm_start = address; - vma->vm_pgoff -= grow; - perf_event_mmap(vma); + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_start = address; + vma->vm_pgoff -= grow; + perf_event_mmap(vma); + } } } vma_unlock_anon_vma(vma); @@ -1826,11 +1795,6 @@ static int expand_downwards(struct vm_area_struct *vma, return error; } -int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address) -{ - return expand_downwards(vma, address); -} - #ifdef CONFIG_STACK_GROWSUP int expand_stack(struct vm_area_struct *vma, unsigned long address) { @@ -1913,17 +1877,17 @@ static void unmap_region(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; - struct mmu_gather *tlb; + struct mmu_gather tlb; unsigned long nr_accounted = 0; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, - next? next->vm_start: 0); - tlb_finish_mmu(tlb, start, end); + free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + next ? next->vm_start : 0); + tlb_finish_mmu(&tlb, start, end); } /* @@ -2065,9 +2029,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return -EINVAL; /* Find the first overlapping VMA */ - vma = find_vma_prev(mm, start, &prev); + vma = find_vma(mm, start); if (!vma) return 0; + prev = vma->vm_prev; /* we have start < vma->vm_end */ /* if it doesn't overlap, we have nothing.. */ @@ -2265,7 +2230,7 @@ EXPORT_SYMBOL(do_brk); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { - struct mmu_gather *tlb; + struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; @@ -2290,14 +2255,14 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb = tlb_gather_mmu(mm, 1); + tlb_gather_mmu(&tlb, mm, 1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); - tlb_finish_mmu(tlb, 0, end); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + tlb_finish_mmu(&tlb, 0, end); /* * Walk the list again, actually closing and freeing it, @@ -2311,7 +2276,7 @@ void exit_mmap(struct mm_struct *mm) /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_lock is taken here. + * then i_mmap_mutex is taken here. */ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) { @@ -2523,15 +2488,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); + mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->root->lock. If some other vma in this mm shares + * anon_vma->root->mutex. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->root->lock. + * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->head.next)) @@ -2553,7 +2518,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); + mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); } } @@ -2580,7 +2545,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * vma in this mm is backed by the same anon_vma or address_space. * * We can take all the locks in random order because the VM code - * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never * takes more than one of them in a row. Secondly we're protected * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. * @@ -2636,7 +2601,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->root->lock. + * anon_vma->root->mutex. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->head.next)) @@ -2652,7 +2617,7 @@ static void vm_unlock_mapping(struct address_space *mapping) * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); diff --git a/mm/mremap.c b/mm/mremap.c index 1de98d492dd..506fa44403d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -93,8 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; - spin_lock(&mapping->i_mmap_lock); - new_vma->vm_truncate_count = 0; + mutex_lock(&mapping->i_mmap_mutex); } /* @@ -123,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } @@ -277,9 +276,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) goto Efault; - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { - if (new_len > old_len) + /* Need to be careful about a growing mapping */ + if (new_len > old_len) { + unsigned long pgoff; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) goto Efault; + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + goto Einval; } if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c new file mode 100644 index 00000000000..6e93dc7f258 --- /dev/null +++ b/mm/nobootmem.c @@ -0,0 +1,404 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/kmemleak.h> +#include <linux/range.h> +#include <linux/memblock.h> + +#include <asm/bug.h> +#include <asm/io.h> +#include <asm/processor.h> + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + void *ptr; + u64 addr; + + if (limit > memblock.current_limit) + limit = memblock.current_limit; + + addr = find_memory_core_early(nid, size, align, goal, limit); + + if (addr == MEMBLOCK_ERROR) + return NULL; + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; +} + +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} + +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +{ + register_page_bootmem_info_node(pgdat); + + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. + */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem_nopanic(size, align, goal, limit); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem(size, align, goal, limit); +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_node(pgdat, size, align, goal); +} + +#ifdef CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +} +#endif + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +} diff --git a/mm/nommu.c b/mm/nommu.c index f59e1424d3d..4358032566e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -22,7 +22,6 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/tracehook.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/mount.h> @@ -680,9 +679,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, **pp, *next; + struct vm_area_struct *pvma, *prev; struct address_space *mapping; - struct rb_node **p, *parent; + struct rb_node **p, *parent, *rb_prev; kenter(",%p", vma); @@ -703,7 +702,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) } /* add the VMA to the tree */ - parent = NULL; + parent = rb_prev = NULL; p = &mm->mm_rb.rb_node; while (*p) { parent = *p; @@ -713,17 +712,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) * (the latter is necessary as we may get identical VMAs) */ if (vma->vm_start < pvma->vm_start) p = &(*p)->rb_left; - else if (vma->vm_start > pvma->vm_start) + else if (vma->vm_start > pvma->vm_start) { + rb_prev = parent; p = &(*p)->rb_right; - else if (vma->vm_end < pvma->vm_end) + } else if (vma->vm_end < pvma->vm_end) p = &(*p)->rb_left; - else if (vma->vm_end > pvma->vm_end) + else if (vma->vm_end > pvma->vm_end) { + rb_prev = parent; p = &(*p)->rb_right; - else if (vma < pvma) + } else if (vma < pvma) p = &(*p)->rb_left; - else if (vma > pvma) + else if (vma > pvma) { + rb_prev = parent; p = &(*p)->rb_right; - else + } else BUG(); } @@ -731,20 +733,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) rb_insert_color(&vma->vm_rb, &mm->mm_rb); /* add VMA to the VMA list also */ - for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { - if (pvma->vm_start > vma->vm_start) - break; - if (pvma->vm_start < vma->vm_start) - continue; - if (pvma->vm_end < vma->vm_end) - break; - } + prev = NULL; + if (rb_prev) + prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - next = *pp; - *pp = vma; - vma->vm_next = next; - if (next) - next->vm_prev = vma; + __vma_link_list(mm, vma, prev, parent); } /* @@ -752,7 +745,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { - struct vm_area_struct **pp; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; @@ -775,12 +767,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) /* remove from the MM's tree and list */ rb_erase(&vma->vm_rb, &mm->mm_rb); - for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { - if (*pp == vma) { - *pp = vma->vm_next; - break; - } - } + + if (vma->vm_prev) + vma->vm_prev->vm_next = vma->vm_next; + else + mm->mmap = vma->vm_next; + + if (vma->vm_next) + vma->vm_next->vm_prev = vma->vm_prev; vma->vm_mm = NULL; } @@ -809,17 +803,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; - struct rb_node *n = mm->mm_rb.rb_node; /* check the cache first */ vma = mm->mmap_cache; if (vma && vma->vm_start <= addr && vma->vm_end > addr) return vma; - /* trawl the tree (there may be multiple mappings in which addr + /* trawl the list (there may be multiple mappings in which addr * resides) */ - for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { - vma = rb_entry(n, struct vm_area_struct, vm_rb); + for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { @@ -859,7 +851,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long len) { struct vm_area_struct *vma; - struct rb_node *n = mm->mm_rb.rb_node; unsigned long end = addr + len; /* check the cache first */ @@ -867,10 +858,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma && vma->vm_start == addr && vma->vm_end == end) return vma; - /* trawl the tree (there may be multiple mappings in which addr + /* trawl the list (there may be multiple mappings in which addr * resides) */ - for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { - vma = rb_entry(n, struct vm_area_struct, vm_rb); + for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->vm_start < addr) continue; if (vma->vm_start > addr) @@ -1096,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) + if ((flags & MAP_PRIVATE) && current->ptrace) vm_flags &= ~VM_MAYSHARE; return vm_flags; @@ -1133,7 +1123,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long capabilities) { struct page *pages; - unsigned long total, point, n, rlen; + unsigned long total, point, n; void *base; int ret, order; @@ -1157,13 +1147,12 @@ static int do_mmap_private(struct vm_area_struct *vma, * make a private copy of the data and map that instead */ } - rlen = PAGE_ALIGN(len); /* allocate some memory to hold the mapping * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - order = get_order(rlen); + order = get_order(len); kdebug("alloc order %d for %lx", order, len); pages = alloc_pages(GFP_KERNEL, order); @@ -1173,7 +1162,7 @@ static int do_mmap_private(struct vm_area_struct *vma, total = 1 << order; atomic_long_add(total, &mmap_pages_allocated); - point = rlen >> PAGE_SHIFT; + point = len >> PAGE_SHIFT; /* we allocated a power-of-2 sized page set, so we may want to trim off * the excess */ @@ -1195,7 +1184,7 @@ static int do_mmap_private(struct vm_area_struct *vma, base = page_address(pages); region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; region->vm_start = (unsigned long) base; - region->vm_end = region->vm_start + rlen; + region->vm_end = region->vm_start + len; region->vm_top = region->vm_start + (total << PAGE_SHIFT); vma->vm_start = region->vm_start; @@ -1211,22 +1200,22 @@ static int do_mmap_private(struct vm_area_struct *vma, old_fs = get_fs(); set_fs(KERNEL_DS); - ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); + ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); set_fs(old_fs); if (ret < 0) goto error_free; /* clear the last little bit */ - if (ret < rlen) - memset(base + ret, 0, rlen - ret); + if (ret < len) + memset(base + ret, 0, len - ret); } return 0; error_free: - free_page_series(region->vm_start, region->vm_end); + free_page_series(region->vm_start, region->vm_top); region->vm_start = vma->vm_start = 0; region->vm_end = vma->vm_end = 0; region->vm_top = 0; @@ -1235,7 +1224,7 @@ error_free: enomem: printk("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(); + show_free_areas(0); return -ENOMEM; } @@ -1268,6 +1257,7 @@ unsigned long do_mmap_pgoff(struct file *file, /* we ignore the address hint */ addr = 0; + len = PAGE_ALIGN(len); /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ @@ -1385,15 +1375,15 @@ unsigned long do_mmap_pgoff(struct file *file, if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); - if (IS_ERR((void *) addr)) { + if (IS_ERR_VALUE(addr)) { ret = addr; - if (ret != (unsigned long) -ENOSYS) + if (ret != -ENOSYS) goto error_just_free; /* the driver refused to tell us where to site * the mapping so we'll have to attempt to copy * it */ - ret = (unsigned long) -ENODEV; + ret = -ENODEV; if (!(capabilities & BDI_CAP_MAP_COPY)) goto error_just_free; @@ -1468,14 +1458,14 @@ error_getting_vma: printk(KERN_WARNING "Allocation of vma for %lu byte allocation" " from process %d failed\n", len, current->pid); - show_free_areas(); + show_free_areas(0); return -ENOMEM; error_getting_region: printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" " from process %d failed\n", len, current->pid); - show_free_areas(); + show_free_areas(0); return -ENOMEM; } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1644,15 +1634,17 @@ static int shrink_vma(struct mm_struct *mm, int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; - struct rb_node *rb; - unsigned long end = start + len; + unsigned long end; int ret; kenter(",%lx,%zx", start, len); + len = PAGE_ALIGN(len); if (len == 0) return -EINVAL; + end = start + len; + /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { @@ -1677,9 +1669,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) } if (end == vma->vm_end) goto erase_whole_vma; - rb = rb_next(&vma->vm_rb); - vma = rb_entry(rb, struct vm_area_struct, vm_rb); - } while (rb); + vma = vma->vm_next; + } while (vma); kleave(" = -EINVAL [split file]"); return -EINVAL; } else { @@ -1773,6 +1764,8 @@ unsigned long do_mremap(unsigned long addr, struct vm_area_struct *vma; /* insanity checks first */ + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; @@ -1819,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, return NULL; } -int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, - unsigned long to, unsigned long size, pgprot_t prot) +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { - vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; + if (addr != (pfn << PAGE_SHIFT)) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; return 0; } EXPORT_SYMBOL(remap_pfn_range); @@ -1842,10 +1838,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} - unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -1892,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -1906,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_page_state(NR_SLAB_RECLAIMABLE); /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - - /* * Leave reserved pages. The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; @@ -1963,7 +1947,7 @@ error: return -ENOMEM; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } @@ -1975,21 +1959,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -/* - * Access another process' address space. - * - source/target buffer must be kernel space - */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { struct vm_area_struct *vma; - struct mm_struct *mm; - - if (addr + len < addr) - return 0; - - mm = get_task_mm(tsk); - if (!mm) - return 0; down_read(&mm->mmap_sem); @@ -2014,6 +1987,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in } up_read(&mm->mmap_sem); + + return len; +} + +/** + * @access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. + * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + len = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); return len; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7dcca55ede7..626303b52f3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -31,12 +31,40 @@ #include <linux/memcontrol.h> #include <linux/mempolicy.h> #include <linux/security.h> +#include <linux/ptrace.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); +/** + * test_set_oom_score_adj() - set current's oom_score_adj and return old value + * @new_val: new oom_score_adj value + * + * Sets the oom_score_adj value for current to @new_val with proper + * synchronization and returns the old value. Usually used to temporarily + * set a value, save the old value in the caller, and then reinstate it later. + */ +int test_set_oom_score_adj(int new_val) +{ + struct sighand_struct *sighand = current->sighand; + int old_val; + + spin_lock_irq(&sighand->siglock); + old_val = current->signal->oom_score_adj; + if (new_val != old_val) { + if (new_val == OOM_SCORE_ADJ_MIN) + atomic_inc(¤t->mm->oom_disable_count); + else if (old_val == OOM_SCORE_ADJ_MIN) + atomic_dec(¤t->mm->oom_disable_count); + current->signal->oom_score_adj = new_val; + } + spin_unlock_irq(&sighand->siglock); + + return old_val; +} + #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill @@ -83,24 +111,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, #endif /* CONFIG_NUMA */ /* - * If this is a system OOM (not a memcg OOM) and the task selected to be - * killed is not already running at high (RT) priorities, speed up the - * recovery by boosting the dying task to the lowest FIFO priority. - * That helps with the recovery and avoids interfering with RT tasks. - */ -static void boost_dying_task_prio(struct task_struct *p, - struct mem_cgroup *mem) -{ - struct sched_param param = { .sched_priority = 1 }; - - if (mem) - return; - - if (!rt_task(p)) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); -} - -/* * The process p may have detached its own ->mm while exiting or through * use_mm(), but one or more of its subthreads may still have a valid * pointer. Return p, or any of its subthreads with a valid ->mm, with @@ -172,15 +182,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, } /* - * When the PF_OOM_ORIGIN bit is set, it indicates the task should have - * priority for oom killing. - */ - if (p->flags & PF_OOM_ORIGIN) { - task_unlock(p); - return 1000; - } - - /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. */ @@ -189,10 +190,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, /* * The baseline for the badness score is the proportion of RAM that each - * task's rss and swap space use. + * task's rss, pagetable and swap space use. */ - points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / - totalpages; + points = get_mm_rss(p->mm) + p->mm->nr_ptes; + points += get_mm_counter(p->mm, MM_SWAPENTS); + + points *= 1000; + points /= totalpages; task_unlock(p); /* @@ -292,13 +296,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long totalpages, struct mem_cgroup *mem, const nodemask_t *nodemask) { - struct task_struct *p; + struct task_struct *g, *p; struct task_struct *chosen = NULL; *ppoints = 0; - for_each_process(p) { + do_each_thread(g, p) { unsigned int points; + if (p->exit_state) + continue; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -313,23 +319,31 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, */ if (test_tsk_thread_flag(p, TIF_MEMDIE)) return ERR_PTR(-1UL); + if (!p->mm) + continue; - /* - * This is in the process of releasing memory so wait for it - * to finish before killing some other task by mistake. - * - * However, if p is the current task, we allow the 'kill' to - * go ahead if it is exiting: this will simply set TIF_MEMDIE, - * which will allow it to gain access to memory reserves in - * the process of exiting and releasing its resources. - * Otherwise we could get an easy OOM deadlock. - */ - if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { - if (p != current) - return ERR_PTR(-1UL); - - chosen = p; - *ppoints = 1000; + if (p->flags & PF_EXITING) { + /* + * If p is the current task and is in the process of + * releasing memory, we allow the "kill" to set + * TIF_MEMDIE, which will allow it to gain access to + * memory reserves. Otherwise, it may stall forever. + * + * The loop isn't broken here, however, in case other + * threads are found to have already been oom killed. + */ + if (p == current) { + chosen = p; + *ppoints = 1000; + } else { + /* + * If this task is not being ptraced on exit, + * then wait for it to finish before killing + * some other task unnecessarily. + */ + if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) + return ERR_PTR(-1UL); + } } points = oom_badness(p, mem, nodemask, totalpages); @@ -337,7 +351,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen = p; *ppoints = points; } - } + } while_each_thread(g, p); return chosen; } @@ -396,7 +410,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); - show_mem(); + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(mem, nodemask); } @@ -442,13 +456,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); - /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... - */ - boost_dying_task_prio(p, mem); - return 0; } #undef K @@ -472,7 +479,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ if (p->flags & PF_EXITING) { set_tsk_thread_flag(p, TIF_MEMDIE); - boost_dying_task_prio(p, mem); return 0; } @@ -483,7 +489,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If any of p's children has a different mm and is eligible for kill, - * the one with the highest badness() score is sacrificed for its + * the one with the highest oom_badness() score is sacrificed for its * parent. This attempts to lose the minimal amount of work done while * still freeing memory. */ @@ -491,6 +497,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; + if (child->mm == p->mm) + continue; /* * oom_badness() returns 0 if the thread is unkillable */ @@ -537,6 +545,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned int points = 0; struct task_struct *p; + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. + */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + return; + } + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); @@ -689,7 +707,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (fatal_signal_pending(current)) { set_thread_flag(TIF_MEMDIE); - boost_dying_task_prio(current, NULL); return; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2cb01f6ec5d..d1960744f88 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -37,6 +37,16 @@ #include <trace/events/writeback.h> /* + * Sleep at most 200ms at a time in balance_dirty_pages(). + */ +#define MAX_PAUSE max(HZ/5, 1) + +/* + * Estimate write bandwidth at 200ms intervals. + */ +#define BANDWIDTH_INTERVAL max(HZ/5, 1) + +/* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ @@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ +unsigned long global_dirty_limit; /* * Scale the writeback cache size proportional to the relative writeout speeds. @@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, */ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { + __inc_bdi_stat(bdi, BDI_WRITTEN); __prop_inc_percpu_max(&vm_completions, &bdi->completions, bdi->max_prop_frac); } @@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk) static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - if (bdi_cap_writeback_dirty(bdi)) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + prop_fraction_percpu(&vm_completions, &bdi->completions, numerator, denominator); - } else { - *numerator = 0; - *denominator = 1; - } } static inline void task_dirties_fraction(struct task_struct *tsk, @@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * effectively curb the growth of dirty pages. Light dirtiers with high enough * dirty threshold may never get throttled. */ +#define TASK_LIMIT_FRACTION 8 static unsigned long task_dirty_limit(struct task_struct *tsk, unsigned long bdi_dirty) { long numerator, denominator; unsigned long dirty = bdi_dirty; - u64 inv = dirty >> 3; + u64 inv = dirty / TASK_LIMIT_FRACTION; task_dirties_fraction(tsk, &numerator, &denominator); inv *= numerator; @@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk, return max(dirty, bdi_dirty/2); } +/* Minimum limit for any task */ +static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) +{ + return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; +} + /* * */ @@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } +static unsigned long hard_dirty_limit(unsigned long thresh) +{ + return max(thresh, global_dirty_limit); +} + /* * global_dirty_limits - background-writeback and dirty-throttling thresholds * @@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) } *pbackground = background; *pdirty = dirty; + trace_global_dirty_state(background, dirty); } -/* +/** * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * @bdi: the backing_dev_info to query + * @dirty: global dirty limit in pages + * + * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * And the "limit" in the name is not seriously taken as hard limit in + * balance_dirty_pages(). * - * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * @@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; } +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + u64 bw; + + /* + * bw = written * HZ / elapsed + * + * bw * elapsed + write_bandwidth * (period - elapsed) + * write_bandwidth = --------------------------------------------------- + * period + */ + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period)) { + do_div(bw, elapsed); + avg = bw; + goto out; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw >>= ilog2(period); + + /* + * one more level of smoothing, for filtering out sudden spikes + */ + if (avg > old && old >= (unsigned long)bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= (unsigned long)bw) + avg += (old - avg) >> 3; + +out: + bdi->write_bandwidth = bw; + bdi->avg_write_bandwidth = avg; +} + +/* + * The global dirtyable memory and dirty threshold could be suddenly knocked + * down by a large amount (eg. on the startup of KVM in a swapless system). + * This may throw the system into deep dirty exceeded state and throttle + * heavy/light dirtiers alike. To retain good responsiveness, maintain + * global_dirty_limit for tracking slowly down to the knocked down dirty + * threshold. + */ +static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +{ + unsigned long limit = global_dirty_limit; + + /* + * Follow up in one step. + */ + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* + * Follow down slowly. Use the higher one as the target, because thresh + * may drop below dirty. This is exactly the reason to introduce + * global_dirty_limit which is guaranteed to lie above the dirty pages. + */ + thresh = max(thresh, dirty); + if (limit > thresh) { + limit -= (limit - thresh) >> 5; + goto update; + } + return; +update: + global_dirty_limit = limit; +} + +static void global_update_bandwidth(unsigned long thresh, + unsigned long dirty, + unsigned long now) +{ + static DEFINE_SPINLOCK(dirty_lock); + static unsigned long update_time; + + /* + * check locklessly first to optimize away locking for the most time + */ + if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + return; + + spin_lock(&dirty_lock); + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { + update_dirty_limit(thresh, dirty); + update_time = now; + } + spin_unlock(&dirty_lock); +} + +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long written; + + /* + * rate-limit, only update once every 200ms. + */ + if (elapsed < BANDWIDTH_INTERVAL) + return; + + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* + * Skip quiet periods when disk bandwidth is under-utilized. + * (at least 1s idle time between two flusher runs) + */ + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + goto snapshot; + + if (thresh) + global_update_bandwidth(thresh, dirty, now); + + bdi_update_write_bandwidth(bdi, elapsed, written); + +snapshot: + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +} + +static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time) +{ + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) + return; + spin_lock(&bdi->wb.list_lock); + __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, + start_time); + spin_unlock(&bdi->wb.list_lock); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) static void balance_dirty_pages(struct address_space *mapping, unsigned long write_chunk) { - long nr_reclaimable, bdi_nr_reclaimable; - long nr_writeback, bdi_nr_writeback; + unsigned long nr_reclaimable, bdi_nr_reclaimable; + unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ + unsigned long bdi_dirty; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + unsigned long task_bdi_thresh; + unsigned long min_task_bdi_thresh; unsigned long pages_written = 0; unsigned long pause = 1; bool dirty_exceeded = false; + bool clear_dirty_exceeded = true; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_writeback = global_page_state(NR_WRITEBACK); + nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping, * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback <= - (background_thresh + dirty_thresh) / 2) + if (nr_dirty <= (background_thresh + dirty_thresh) / 2) break; bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - bdi_thresh = task_dirty_limit(current, bdi_thresh); + min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); + task_bdi_thresh = task_dirty_limit(current, bdi_thresh); /* * In order to avoid the stacked BDI deadlock we need @@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping, * actually dirty; with m+n sitting in the percpu * deltas. */ - if (bdi_thresh < 2*bdi_stat_error(bdi)) { + if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_nr_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); } else { bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + bdi_dirty = bdi_nr_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); } /* @@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping, * bdi or process from holding back light ones; The latter is * the last resort safeguard. */ - dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) - || (nr_reclaimable + nr_writeback > dirty_thresh); + dirty_exceeded = (bdi_dirty > task_bdi_thresh) || + (nr_dirty > dirty_thresh); + clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && + (nr_dirty <= dirty_thresh); if (!dirty_exceeded) break; @@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_thresh, bdi_dirty, start_time); + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been @@ -557,17 +735,40 @@ static void balance_dirty_pages(struct address_space *mapping, * threshold otherwise wait until the disk writes catch * up. */ - trace_wbc_balance_dirty_start(&wbc, bdi); - if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); + trace_balance_dirty_start(bdi); + if (bdi_nr_reclaimable > task_bdi_thresh) { + pages_written += writeback_inodes_wb(&bdi->wb, + write_chunk); + trace_balance_dirty_written(bdi, pages_written); if (pages_written >= write_chunk) break; /* We've done our duty */ } - trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + trace_balance_dirty_wait(bdi); + + dirty_thresh = hard_dirty_limit(dirty_thresh); + /* + * max-pause area. If dirty exceeded but still within this + * area, no need to sleep for more than 200ms: (a) 8 pages per + * 200ms is typically more than enough to curb heavy dirtiers; + * (b) the pause time limit makes the dirtiers more responsive. + */ + if (nr_dirty < dirty_thresh + + dirty_thresh / DIRTY_MAXPAUSE_AREA && + time_after(jiffies, start_time + MAX_PAUSE)) + break; + /* + * pass-good area. When some bdi gets blocked (eg. NFS server + * not responding), or write bandwidth dropped dramatically due + * to concurrent reads, or dirty threshold suddenly dropped and + * the dirty pages cannot be brought down anytime soon (eg. on + * slow USB stick), at least let go of the good bdi's. + */ + if (nr_dirty < dirty_thresh + + dirty_thresh / DIRTY_PASSGOOD_AREA && + bdi_dirty < bdi_thresh) + break; /* * Increase the delay for each loop, up to our previous @@ -578,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping, pause = HZ / 10; } - if (!dirty_exceeded && bdi->dirty_exceeded) + /* Clear dirty_exceeded flag only when no task can exceed the limit */ + if (clear_dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) @@ -626,9 +828,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { + struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long ratelimit; unsigned long *p; + if (!bdi_cap_account_dirty(bdi)) + return; + ratelimit = ratelimit_pages; if (mapping->backing_dev_info->dirty_exceeded) ratelimit = 8; @@ -892,12 +1098,12 @@ int write_cache_pages(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { @@ -927,7 +1133,7 @@ retry: break; } - done_index = page->index + 1; + done_index = page->index; lock_page(page); @@ -977,6 +1183,7 @@ continue_unlock: * not be suitable for data integrity * writeout). */ + done_index = page->index + 1; done = 1; break; } @@ -1039,11 +1246,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc, int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct blk_plug plug; + int ret; + /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; - return write_cache_pages(mapping, wbc, __writepage, mapping); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; } EXPORT_SYMBOL(generic_writepages); @@ -1134,7 +1347,6 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); @@ -1211,6 +1423,17 @@ int set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; + /* + * readahead/lru_deactivate_page could remain + * PG_readahead/PG_reclaim due to race with end_page_writeback + * About readahead, if the page is written, the flags would be + * reset. So no problem. + * About lru_deactivate_page, if the page is redirty, the flag + * will be reset. So no problem. but if the page is used by readahead + * it will confuse readahead and make it restart the size rampup + * process. But it's a trivial problem. + */ + ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; @@ -1239,7 +1462,7 @@ int set_page_dirty_lock(struct page *page) { int ret; - lock_page_nosync(page); + lock_page(page); ret = set_page_dirty(page); unlock_page(page); return ret; @@ -1266,7 +1489,6 @@ int clear_page_dirty_for_io(struct page *page) BUG_ON(!PageLocked(page)); - ClearPageReclaim(page); if (mapping && mapping_cap_account_dirty(mapping)) { /* * Yes, Virginia, this is indeed insane. @@ -1341,8 +1563,10 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } - if (ret) + if (ret) { dec_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); + } return ret; } @@ -1388,10 +1612,6 @@ EXPORT_SYMBOL(test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - int ret; - rcu_read_lock(); - ret = radix_tree_tagged(&mapping->page_tree, tag); - rcu_read_unlock(); - return ret; + return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdef1d4b4e4..6e8ecb6e021 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -30,6 +30,7 @@ #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include <linux/oom.h> #include <linux/notifier.h> #include <linux/topology.h> @@ -39,6 +40,7 @@ #include <linux/memory_hotplug.h> #include <linux/nodemask.h> #include <linux/vmalloc.h> +#include <linux/vmstat.h> #include <linux/mempolicy.h> #include <linux/stop_machine.h> #include <linux/sort.h> @@ -53,6 +55,8 @@ #include <linux/compaction.h> #include <trace/events/kmem.h> #include <linux/ftrace_event.h> +#include <linux/memcontrol.h> +#include <linux/prefetch.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -286,7 +290,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ return; } @@ -317,7 +321,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -565,7 +569,8 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { + (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -614,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, list = &pcp->lists[migratetype]; } while (list_empty(list)); + /* This is the only non-empty list. Free them all. */ + if (batch_free == MIGRATE_PCPTYPES) + batch_free = to_free; + do { page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ @@ -750,7 +759,8 @@ static inline int check_new_page(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { + (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -863,9 +873,8 @@ static int move_freepages(struct zone *zone, } order = page_order(page); - list_del(&page->lru); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + list_move(&page->lru, + &zone->free_area[order].free_list[migratetype]); page += 1 << order; pages_moved += 1 << order; } @@ -936,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * If breaking a large block of pages, move all free * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more - * agressive about taking ownership of free pages + * aggressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || start_migratetype == MIGRATE_RECLAIMABLE || @@ -1333,7 +1342,7 @@ again: } __count_zone_vm_events(PGALLOC, zone, 1 << order); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); VM_BUG_ON(bad_range(zone, page)); @@ -1361,21 +1370,12 @@ failed: #ifdef CONFIG_FAIL_PAGE_ALLOC -static struct fail_page_alloc_attr { +static struct { struct fault_attr attr; u32 ignore_gfp_highmem; u32 ignore_gfp_wait; u32 min_order; - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - - struct dentry *ignore_gfp_highmem_file; - struct dentry *ignore_gfp_wait_file; - struct dentry *min_order_file; - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ - } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, @@ -1409,36 +1409,27 @@ static int __init fail_page_alloc_debugfs(void) { mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; - int err; - - err = init_fault_attr_dentries(&fail_page_alloc.attr, - "fail_page_alloc"); - if (err) - return err; - dir = fail_page_alloc.attr.dentries.dir; - - fail_page_alloc.ignore_gfp_wait_file = - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait); - - fail_page_alloc.ignore_gfp_highmem_file = - debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem); - fail_page_alloc.min_order_file = - debugfs_create_u32("min-order", mode, dir, - &fail_page_alloc.min_order); - - if (!fail_page_alloc.ignore_gfp_wait_file || - !fail_page_alloc.ignore_gfp_highmem_file || - !fail_page_alloc.min_order_file) { - err = -ENOMEM; - debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); - debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); - debugfs_remove(fail_page_alloc.min_order_file); - cleanup_fault_attr_dentries(&fail_page_alloc.attr); - } - return err; + dir = fault_create_debugfs_attr("fail_page_alloc", NULL, + &fail_page_alloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_wait)) + goto fail; + if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem)) + goto fail; + if (!debugfs_create_u32("min-order", mode, dir, + &fail_page_alloc.min_order)) + goto fail; + + return 0; +fail: + debugfs_remove_recursive(dir); + + return -ENOMEM; } late_initcall(fail_page_alloc_debugfs); @@ -1607,6 +1598,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) set_bit(i, zlc->fullzones); } +/* + * clear all zones full, called after direct reclaim makes progress so that + * a zone that was recently full is not skipped over for up to a second + */ +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1623,6 +1629,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } + +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1655,7 +1665,7 @@ zonelist_scan: continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) - goto try_next_zone; + continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { @@ -1667,17 +1677,36 @@ zonelist_scan: classzone_idx, alloc_flags)) goto try_this_zone; + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + /* + * we do zlc_setup if there are multiple nodes + * and before considering the first zone allowed + * by the cpuset. + */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; + } + if (zone_reclaim_mode == 0) goto this_zone_full; + /* + * As we may have just activated ZLC, check if the first + * eligible zone has failed zone_reclaim recently. + */ + if (NUMA_BUILD && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; + ret = zone_reclaim(zone, gfp_mask, order); switch (ret) { case ZONE_RECLAIM_NOSCAN: /* did not scan */ - goto try_next_zone; + continue; case ZONE_RECLAIM_FULL: /* scanned but unreclaimable */ - goto this_zone_full; + continue; default: /* did we reclaim enough */ if (!zone_watermark_ok(zone, order, mark, @@ -1694,16 +1723,6 @@ try_this_zone: this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); -try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup after the first zone is tried but only - * if there are multiple nodes make it worthwhile - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } } if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { @@ -1714,6 +1733,59 @@ try_next_zone: return page; } +/* + * Large machines with many possible nodes should not always dump per-node + * meminfo in irq context. + */ +static inline bool should_suppress_show_mem(void) +{ + bool ret = false; + +#if NODES_SHIFT > 8 + ret = in_interrupt(); +#endif + return ret; +} + +static DEFINE_RATELIMIT_STATE(nopage_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +{ + va_list args; + unsigned int filter = SHOW_MEM_FILTER_NODES; + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + return; + + /* + * This documents exceptions given to allocations in certain + * contexts that are allowed to allocate outside current's set + * of allowed nodes. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (test_thread_flag(TIF_MEMDIE) || + (current->flags & (PF_MEMALLOC | PF_EXITING))) + filter &= ~SHOW_MEM_FILTER_NODES; + if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + filter &= ~SHOW_MEM_FILTER_NODES; + + if (fmt) { + printk(KERN_WARNING); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + } + + pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", + current->comm, order, gfp_mask); + + dump_stack(); + if (!should_suppress_show_mem()) + show_mem(filter); +} + static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed) @@ -1892,6 +1964,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; + /* After successful reclaim, reconsider all zones for allocation */ + if (NUMA_BUILD) + zlc_clear_zones_full(zonelist); + retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, @@ -2044,6 +2120,7 @@ restart: first_zones_zonelist(zonelist, high_zoneidx, NULL, &preferred_zone); +rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2051,7 +2128,6 @@ restart: if (page) goto got_pg; -rebalance: /* Allocate without watermarks if the context allows */ if (alloc_flags & ALLOC_NO_WATERMARKS) { page = __alloc_pages_high_priority(gfp_mask, order, @@ -2156,13 +2232,7 @@ rebalance: } nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - printk(KERN_WARNING "%s: page allocation failure." - " order:%d, mode:0x%x\n", - current->comm, order, gfp_mask); - dump_stack(); - show_mem(); - } + warn_alloc_failed(gfp_mask, order, NULL); return page; got_pg: if (kmemcheck_enabled) @@ -2283,6 +2353,21 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +{ + if (addr) { + unsigned long alloc_end = addr + (PAGE_SIZE << order); + unsigned long used = addr + PAGE_ALIGN(size); + + split_page(virt_to_page((void *)addr), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + return (void *)addr; +} + /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate @@ -2302,22 +2387,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long addr; addr = __get_free_pages(gfp_mask, order); - if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); - - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } - } - - return (void *)addr; + return make_alloc_exact(addr, order, size); } EXPORT_SYMBOL(alloc_pages_exact); /** + * alloc_pages_exact_nid - allocate an exact number of physically-contiguous + * pages on a node. + * @nid: the preferred node ID where memory should be allocated + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * Like alloc_pages_exact(), but try to allocate on node nid first before falling + * back. + * Note this is not alloc_pages_exact_node() which allocates on a specific node, + * but is not exact. + */ +void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +{ + unsigned order = get_order(size); + struct page *p = alloc_pages_node(nid, gfp_mask, order); + if (!p) + return NULL; + return make_alloc_exact((unsigned long)page_address(p), order, size); +} +EXPORT_SYMBOL(alloc_pages_exact_nid); + +/** * free_pages_exact - release memory allocated via alloc_pages_exact() * @virt: the value returned by alloc_pages_exact. * @size: size of allocation, same value as passed to alloc_pages_exact(). @@ -2411,19 +2507,41 @@ void si_meminfo_node(struct sysinfo *val, int nid) } #endif +/* + * Determine whether the node should be displayed or not, depending on whether + * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). + */ +bool skip_free_areas_node(unsigned int flags, int nid) +{ + bool ret = false; + + if (!(flags & SHOW_MEM_FILTER_NODES)) + goto out; + + get_mems_allowed(); + ret = !node_isset(nid, cpuset_current_mems_allowed); + put_mems_allowed(); +out: + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. + * Suppresses nodes that are not allowed by current's cpuset if + * SHOW_MEM_FILTER_NODES is passed. */ -void show_free_areas(void) +void show_free_areas(unsigned int filter) { int cpu; struct zone *zone; for_each_populated_zone(zone) { + if (skip_free_areas_node(filter, zone_to_nid(zone))) + continue; show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -2465,6 +2583,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { int i; + if (skip_free_areas_node(filter, zone_to_nid(zone))) + continue; show_node(zone); printk("%s" " free:%lukB" @@ -2532,6 +2652,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (skip_free_areas_node(filter, zone_to_nid(zone))) + continue; show_node(zone); printk("%s: ", zone->name); @@ -3110,7 +3232,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. */ -void build_all_zonelists(void *data) +void __ref build_all_zonelists(void *data) { set_zonelist_order(); @@ -3221,6 +3343,20 @@ static inline unsigned long wait_table_bits(unsigned long size) #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* + * Check if a pageblock contains reserved pages + */ +static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) + return 1; + } + return 0; +} + +/* * Mark a number of pageblocks as MIGRATE_RESERVE. The number * of blocks reserved is based on min_wmark_pages(zone). The memory within * the reserve will tend to store contiguous free pages. Setting min_free_kbytes @@ -3229,7 +3365,7 @@ static inline unsigned long wait_table_bits(unsigned long size) */ static void setup_zone_migrate_reserve(struct zone *zone) { - unsigned long start_pfn, pfn, end_pfn; + unsigned long start_pfn, pfn, end_pfn, block_end_pfn; struct page *page; unsigned long block_migratetype; int reserve; @@ -3259,7 +3395,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) continue; /* Blocks with reserved pages will never free, skip them. */ - if (PageReserved(page)) + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); + if (pageblock_is_reserved(pfn, block_end_pfn)) continue; block_migratetype = get_pageblock_migratetype(page); @@ -3448,7 +3585,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } -static __meminit void setup_zone_pageset(struct zone *zone) +static void setup_zone_pageset(struct zone *zone) { int cpu; @@ -3498,7 +3635,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, alloc_size); + alloc_bootmem_node_nopanic(pgdat, alloc_size); } else { /* * This case means that a zone whose size was 0 gets new memory @@ -3699,13 +3836,45 @@ void __init free_bootmem_with_active_regions(int nid, } #ifdef CONFIG_HAVE_MEMBLOCK +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { int i; /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { + for_each_active_range_index_in_nid_reverse(i, nid) { u64 addr; u64 ei_start, ei_last; u64 final_start, final_end; @@ -3748,34 +3917,6 @@ int __init add_from_early_node_map(struct range *range, int az, return nr_range; } -#ifdef CONFIG_NO_BOOTMEM -void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} -#endif - - void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -3856,7 +3997,7 @@ static void __init find_usable_zone_for_movable(void) /* * The zone ranges provided by the architecture do not include ZONE_MOVABLE - * because it is sized independant of architecture. Unlike the other zones, + * because it is sized independent of architecture. Unlike the other zones, * the starting point for ZONE_MOVABLE is not fixed. It may be different * in each node depending on the size of each node and how evenly kernelcore * is distributed. This helper function adjusts the zone ranges @@ -4071,7 +4212,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, unsigned long usemapsize = usemap_size(zonesize); zone->pageblock_flags = NULL; if (usemapsize) - zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); + zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, + usemapsize); } #else static inline void setup_usemap(struct pglist_data *pgdat, @@ -4191,10 +4333,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) { + for_each_lru(l) INIT_LIST_HEAD(&zone->lru[l].list); - zone->reclaim_stat.nr_saved_scan[l] = 0; - } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; @@ -4237,7 +4377,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) - map = alloc_bootmem_node(pgdat, size); + map = alloc_bootmem_node_nopanic(pgdat, size); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -4459,6 +4599,60 @@ void __init sort_node_map(void) cmp_node_active_region, NULL); } +/** + * node_map_pfn_alignment - determine the maximum internode alignment + * + * This function should be called after node map is populated and sorted. + * It calculates the maximum power of two alignment which can distinguish + * all the nodes. + * + * For example, if all nodes are 1GiB and aligned to 1GiB, the return value + * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the + * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is + * shifted, 1GiB is enough and this function will indicate so. + * + * This is used to test whether pfn -> nid mapping of the chosen memory + * model has fine enough granularity to avoid incorrect mapping for the + * populated node map. + * + * Returns the determined alignment in pfn's. 0 if there is no alignment + * requirement (single node). + */ +unsigned long __init node_map_pfn_alignment(void) +{ + unsigned long accl_mask = 0, last_end = 0; + int last_nid = -1; + int i; + + for_each_active_range_index_in_nid(i, MAX_NUMNODES) { + int nid = early_node_map[i].nid; + unsigned long start = early_node_map[i].start_pfn; + unsigned long end = early_node_map[i].end_pfn; + unsigned long mask; + + if (!start || last_nid < 0 || last_nid == nid) { + last_nid = nid; + last_end = end; + continue; + } + + /* + * Start with a mask granular enough to pin-point to the + * start pfn and tick off bits one-by-one until it becomes + * too coarse to separate the current node from the last. + */ + mask = ~((1 << __ffs(start)) - 1); + while (mask && last_end <= (start & (mask << 1))) + mask <<= 1; + + /* accumulate all internode masks */ + accl_mask |= mask; + } + + /* convert mask to number of pages */ + return ~accl_mask + 1; +} + /* Find the lowest pfn for a node */ static unsigned long __init find_min_pfn_for_node(int nid) { @@ -4809,15 +5003,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { -#ifndef CONFIG_NO_BOOTMEM - .bdata = &bootmem_node_data[0] -#endif - }; -EXPORT_SYMBOL(contig_page_data); -#endif - void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, @@ -5011,7 +5196,7 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -void calculate_zone_inactive_ratio(struct zone *zone) +static void __meminit calculate_zone_inactive_ratio(struct zone *zone) { unsigned int gb, ratio; @@ -5025,7 +5210,7 @@ void calculate_zone_inactive_ratio(struct zone *zone) zone->inactive_ratio = ratio; } -static void __init setup_per_zone_inactive_ratio(void) +static void __meminit setup_per_zone_inactive_ratio(void) { struct zone *zone; @@ -5057,7 +5242,7 @@ static void __init setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_wmark_min(void) +int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -5069,6 +5254,7 @@ static int __init init_per_zone_wmark_min(void) if (min_free_kbytes > 65536) min_free_kbytes = 65536; setup_per_zone_wmarks(); + refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; @@ -5419,10 +5605,8 @@ int set_migratetype_isolate(struct page *page) struct memory_isolate_notify arg; int notifier_ret; int ret = -EBUSY; - int zone_idx; zone = page_zone(page); - zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); @@ -5626,4 +5810,5 @@ void dump_page(struct page *page) page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); + mem_cgroup_print_bad_page(page); } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5bffada7cde..39d216d535e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,12 +11,11 @@ #include <linux/swapops.h> #include <linux/kmemleak.h> -static void __meminit -__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) { pc->flags = 0; + set_page_cgroup_array_id(pc, id); pc->mem_cgroup = NULL; - pc->page = pfn_to_page(pfn); INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; @@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return base + offset; } +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + unsigned long pfn; + struct page *page; + pg_data_t *pgdat; + + pgdat = NODE_DATA(page_cgroup_array_id(pc)); + pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; + page = pfn_to_page(pfn); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + static int __init alloc_node_page_cgroup(int nid) { struct page_cgroup *base, *pc; @@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid) return -ENOMEM; for (index = 0; index < nr_pages; index++) { pc = base + index; - __init_page_cgroup(pc, start_pfn + index); + init_page_cgroup(pc, nid); } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; @@ -105,46 +117,74 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } -/* __alloc_bootmem...() is protected by !slab_available() */ -static int __init_refok init_section_page_cgroup(unsigned long pfn) +struct page *lookup_cgroup_page(struct page_cgroup *pc) { - struct mem_section *section = __pfn_to_section(pfn); - struct page_cgroup *base, *pc; - unsigned long table_size; - int nid, index; - - if (!section->page_cgroup) { - nid = page_to_nid(pfn_to_page(pfn)); - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - VM_BUG_ON(!slab_is_available()); - if (node_state(nid, N_HIGH_MEMORY)) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); - if (!base) - base = vmalloc(table_size); - } - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); + struct mem_section *section; + struct page *page; + unsigned long nr; + + nr = page_cgroup_array_id(pc); + section = __nr_to_section(nr); + page = pfn_to_page(pc - section->page_cgroup); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + +static void *__meminit alloc_page_cgroup(size_t size, int nid) +{ + void *addr = NULL; + + addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); + if (addr) + return addr; + + if (node_state(nid, N_HIGH_MEMORY)) + addr = vmalloc_node(size, nid); + else + addr = vmalloc(size); + + return addr; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); } else { - /* - * We don't have to allocate page_cgroup again, but - * address of memmap may be changed. So, we have to initialize - * again. - */ - base = section->page_cgroup + pfn; - table_size = 0; - /* check address of memmap is changed or not. */ - if (base->page == pfn_to_page(pfn)) - return 0; + struct page *page = virt_to_page(addr); + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); } +} +#endif + +static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) +{ + struct page_cgroup *base, *pc; + struct mem_section *section; + unsigned long table_size; + unsigned long nr; + int index; + + nr = pfn_to_section_nr(pfn); + section = __nr_to_section(nr); + + if (section->page_cgroup) + return 0; + + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + base = alloc_page_cgroup(table_size, nid); + + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); if (!base) { printk(KERN_ERR "page cgroup allocation failure\n"); @@ -153,9 +193,13 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) for (index = 0; index < PAGES_PER_SECTION; index++) { pc = base + index; - __init_page_cgroup(pc, pfn + index); + init_page_cgroup(pc, nr); } - + /* + * The passed "pfn" may not be aligned to SECTION. For the calculation + * we need to apply a mask. + */ + pfn &= PAGE_SECTION_MASK; section->page_cgroup = base - pfn; total_usage += table_size; return 0; @@ -170,16 +214,8 @@ void __free_page_cgroup(unsigned long pfn) if (!ms || !ms->page_cgroup) return; base = ms->page_cgroup + pfn; - if (is_vmalloc_addr(base)) { - vfree(base); - ms->page_cgroup = NULL; - } else { - struct page *page = virt_to_page(base); - if (!PageReserved(page)) { /* Is bootmem ? */ - kfree(base); - ms->page_cgroup = NULL; - } - } + free_page_cgroup(base); + ms->page_cgroup = NULL; } int __meminit online_page_cgroup(unsigned long start_pfn, @@ -189,13 +225,23 @@ int __meminit online_page_cgroup(unsigned long start_pfn, unsigned long start, end, pfn; int fail = 0; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + if (nid == -1) { + /* + * In this case, "nid" already exists and contains valid memory. + * "start_pfn" passed to us is a pfn which is an arg for + * online__pages(), and start_pfn should exist. + */ + nid = pfn_to_nid(start_pfn); + VM_BUG_ON(!node_state(nid, N_ONLINE)); + } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { if (!pfn_present(pfn)) continue; - fail = init_section_page_cgroup(pfn); + fail = init_section_page_cgroup(pfn, nid); } if (!fail) return 0; @@ -212,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn, { unsigned long start, end, pfn; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_cgroup(pfn); @@ -243,12 +289,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, break; } - if (ret) - ret = notifier_from_errno(ret); - else - ret = NOTIFY_OK; - - return ret; + return notifier_from_errno(ret); } #endif @@ -256,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, void __init page_cgroup_init(void) { unsigned long pfn; - int fail = 0; + int nid; if (mem_cgroup_disabled()) return; - for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { - if (!pfn_present(pfn)) - continue; - fail = init_section_page_cgroup(pfn); - } - if (fail) { - printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); - panic("Out of memory"); - } else { - hotplug_memory_notifier(page_cgroup_callback, 0); + for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long start_pfn, end_pfn; + + start_pfn = node_start_pfn(nid); + end_pfn = node_end_pfn(nid); + /* + * start_pfn and end_pfn may not be aligned to SECTION and the + * page->flags of out of node pages are not initialized. So we + * scan [start_pfn, the biggest section's pfn < end_pfn) here. + */ + for (pfn = start_pfn; + pfn < end_pfn; + pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { + + if (!pfn_valid(pfn)) + continue; + /* + * Nodes's pfns can be overlapping. + * We know some arch can have a nodes layout such as + * -------------pfn--------------> + * N0 | N1 | N2 | N0 | N1 | N2|.... + */ + if (pfn_to_nid(pfn) != nid) + continue; + if (init_section_page_cgroup(pfn, nid)) + goto oom; + } } + hotplug_memory_notifier(page_cgroup_callback, 0); printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" - " want memory cgroups\n"); + printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " + "don't want memory cgroups\n"); + return; +oom: + printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); + panic("Out of memory"); } void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) @@ -349,7 +412,7 @@ not_enough_page: * @new: new id * * Returns old id at success, 0 at failure. - * (There is no mem_cgroup useing 0 as its id) + * (There is no mem_cgroup using 0 as its id) */ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) @@ -447,7 +510,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) if (!do_swap_account) return 0; - length = ((max_pages/SC_PER_PAGE) + 1); + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); array = vmalloc(array_size); @@ -464,8 +527,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) /* memory shortage */ ctrl->map = NULL; ctrl->length = 0; - vfree(array); mutex_unlock(&swap_cgroup_mutex); + vfree(array); goto nomem; } mutex_unlock(&swap_cgroup_mutex); @@ -474,13 +537,14 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) nomem: printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); printk(KERN_INFO - "swap_cgroup can be disabled by noswapaccount boot option\n"); + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } void swap_cgroup_swapoff(int type) { - int i; + struct page **map; + unsigned long i, length; struct swap_cgroup_ctrl *ctrl; if (!do_swap_account) @@ -488,17 +552,20 @@ void swap_cgroup_swapoff(int type) mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; - if (ctrl->map) { - for (i = 0; i < ctrl->length; i++) { - struct page *page = ctrl->map[i]; + map = ctrl->map; + length = ctrl->length; + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + + if (map) { + for (i = 0; i < length; i++) { + struct page *page = map[i]; if (page) __free_page(page); } - vfree(ctrl->map); - ctrl->map = NULL; - ctrl->length = 0; + vfree(map); } - mutex_unlock(&swap_cgroup_mutex); } #endif diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf46..dc76b4d0611 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7cfa6ae0230..2f5cf10ff66 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -33,19 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { +again: next = pmd_addr_end(addr, end); - split_huge_page_pmd(walk->mm, pmd); - if (pmd_none_or_clear_bad(pmd)) { + if (pmd_none(*pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; continue; } + /* + * This implies that each ->pmd_entry() handler + * needs to know about pmd_trans_huge() pmds + */ if (walk->pmd_entry) err = walk->pmd_entry(pmd, addr, next, walk); - if (!err && walk->pte_entry) - err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; + + /* + * Check this here so we only break down trans_huge + * pages when we _need_ to + */ + if (!walk->pte_entry) + continue; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_none_or_clear_bad(pmd)) + goto again; + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); @@ -110,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -#endif + +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + struct vm_area_struct *vma; + + /* We don't need vma lookup at all. */ + if (!walk->hugetlb_entry) + return NULL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + vma = find_vma(walk->mm, addr); + if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) + return vma; + + return NULL; +} + +#else /* CONFIG_HUGETLB_PAGE */ +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + return NULL; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + return 0; +} + +#endif /* CONFIG_HUGETLB_PAGE */ + + /** * walk_page_range - walk a memory map's page tables with a callback @@ -128,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, * associated range, and a copy of the original mm_walk for access to * the ->private or ->mm fields. * - * No locks are taken, but the bottom level iterator will map PTE + * Usually no locks are taken, but splitting transparent huge page may + * take page table lock. And the bottom level iterator will map PTE * directories from highmem if necessary. * * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. + * + * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry + * is !NULL. */ int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -149,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *uninitialized_var(vma); + struct vm_area_struct *vma; next = pgd_addr_end(addr, end); -#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. */ - vma = find_vma(walk->mm, addr); - if (vma && is_vm_hugetlb_page(vma)) { + vma = hugetlb_vma(addr, walk); + if (vma) { if (vma->vm_end < next) next = vma->vm_end; /* @@ -173,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, next); continue; } -#endif + if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); diff --git a/mm/percpu.c b/mm/percpu.c index 3f930018aa6..bf80e55dbed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -342,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * @chunk: chunk of interest * * Determine whether area map of @chunk needs to be extended to - * accomodate a new allocation. + * accommodate a new allocation. * * CONTEXT: * pcpu_lock. @@ -431,7 +431,7 @@ out_unlock: * depending on @head, is reduced by @tail bytes and @tail byte block * is inserted after the target block. * - * @chunk->map must have enough free slots to accomodate the split. + * @chunk->map must have enough free slots to accommodate the split. * * CONTEXT: * pcpu_lock. @@ -1008,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) } if (in_first_chunk) { - if ((unsigned long)addr < VMALLOC_START || - (unsigned long)addr >= VMALLOC_END) + if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)); @@ -1216,8 +1215,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); #endif PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); @@ -1436,7 +1437,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest - * which can accomodate 4k aligned segments which are equal to + * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size. */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); @@ -1551,7 +1552,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @alloc_fn: function to allocate percpu page - * @free_fn: funtion to free percpu page + * @free_fn: function to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1646,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " - "space 0x%lx\n", - max_distance, VMALLOC_END - VMALLOC_START); + "space 0x%lx\n", max_distance, + (unsigned long)(VMALLOC_END - VMALLOC_START)); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; @@ -1679,7 +1680,7 @@ out_free: * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE - * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @free_fn: function to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu diff --git a/mm/prio_tree.c b/mm/prio_tree.c index 603ae98d969..799dcfd7cd8 100644 --- a/mm/prio_tree.c +++ b/mm/prio_tree.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/prio_tree.h> +#include <linux/prefetch.h> /* * See lib/prio_tree.c for details on the general radix priority search tree diff --git a/mm/readahead.c b/mm/readahead.c index 77506a291a2..867f9dd82dc 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, struct list_head *pages, unsigned nr_pages) { + struct blk_plug plug; unsigned page_idx; int ret; + blk_start_plug(&plug); + if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); /* Clean up the remaining pages */ @@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp, page_cache_release(page); } ret = 0; + out: + blk_finish_plug(&plug); + return ret; } @@ -174,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (page) continue; - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_readahead(mapping); if (!page) break; page->index = page_offset; @@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); - -#ifdef CONFIG_BLOCK - /* - * Normally the current page is !uptodate and lock_page() will be - * immediately called to implicitly unplug the device. However this - * is not always true for RAID conifgurations, where data arrives - * not strictly in their submission order. In this case we need to - * explicitly kick off the IO. - */ - if (PageUptodate(page)) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index f21f4a1d6a1..8005080fb9e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -21,25 +21,24 @@ * Lock ordering in mm: * * inode->i_mutex (while writing or truncating, not reading or faulting) - * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_lock - * anon_vma->lock + * mapping->i_mmap_mutex + * anon_vma->mutex * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * within bdi.wb->list_lock in __sync_single_inode) * - * (code doesn't rely on that order so it could be switched around) - * ->tasklist_lock - * anon_vma->lock (memory_failure, collect_procs_anon) + * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) + * ->tasklist_lock * pte map lock */ @@ -67,17 +66,53 @@ static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { - return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); + struct anon_vma *anon_vma; + + anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); + if (anon_vma) { + atomic_set(&anon_vma->refcount, 1); + /* + * Initialise the anon_vma root to point to itself. If called + * from fork, the root will be reset to the parents anon_vma. + */ + anon_vma->root = anon_vma; + } + + return anon_vma; } -void anon_vma_free(struct anon_vma *anon_vma) +static inline void anon_vma_free(struct anon_vma *anon_vma) { + VM_BUG_ON(atomic_read(&anon_vma->refcount)); + + /* + * Synchronize against page_lock_anon_vma() such that + * we can safely hold the lock without the anon_vma getting + * freed. + * + * Relies on the full mb implied by the atomic_dec_and_test() from + * put_anon_vma() against the acquire barrier implied by + * mutex_trylock() from page_lock_anon_vma(). This orders: + * + * page_lock_anon_vma() VS put_anon_vma() + * mutex_trylock() atomic_dec_and_test() + * LOCK MB + * atomic_read() mutex_is_locked() + * + * LOCK should suffice since the actual taking of the lock must + * happen _before_ what follows. + */ + if (mutex_is_locked(&anon_vma->root->mutex)) { + anon_vma_lock(anon_vma); + anon_vma_unlock(anon_vma); + } + kmem_cache_free(anon_vma_cachep, anon_vma); } -static inline struct anon_vma_chain *anon_vma_chain_alloc(void) +static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) { - return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); + return kmem_cache_alloc(anon_vma_chain_cachep, gfp); } static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) @@ -122,7 +157,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct anon_vma *allocated; - avc = anon_vma_chain_alloc(); + avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_enomem; @@ -133,11 +168,6 @@ int anon_vma_prepare(struct vm_area_struct *vma) if (unlikely(!anon_vma)) goto out_enomem_free_avc; allocated = anon_vma; - /* - * This VMA had no anon_vma yet. This anon_vma is - * the root of any anon_vma tree that might form. - */ - anon_vma->root = anon_vma; } anon_vma_lock(anon_vma); @@ -156,7 +186,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) anon_vma_unlock(anon_vma); if (unlikely(allocated)) - anon_vma_free(allocated); + put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); } @@ -168,6 +198,32 @@ int anon_vma_prepare(struct vm_area_struct *vma) return -ENOMEM; } +/* + * This is a useful helper function for locking the anon_vma root as + * we traverse the vma->anon_vma_chain, looping over anon_vma's that + * have the same vma. + * + * Such anon_vma's should have the same root, so you'd expect to see + * just a single mutex_lock for the whole traversal. + */ +static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) +{ + struct anon_vma *new_root = anon_vma->root; + if (new_root != root) { + if (WARN_ON_ONCE(root)) + mutex_unlock(&root->mutex); + root = new_root; + mutex_lock(&root->mutex); + } + return root; +} + +static inline void unlock_anon_vma_root(struct anon_vma *root) +{ + if (root) + mutex_unlock(&root->mutex); +} + static void anon_vma_chain_link(struct vm_area_struct *vma, struct anon_vma_chain *avc, struct anon_vma *anon_vma) @@ -176,13 +232,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - anon_vma_lock(anon_vma); /* * It's critical to add new vmas to the tail of the anon_vma, * see comment in huge_memory.c:__split_huge_page(). */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); - anon_vma_unlock(anon_vma); } /* @@ -192,13 +246,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; + struct anon_vma *root = NULL; list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { - avc = anon_vma_chain_alloc(); - if (!avc) - goto enomem_failure; - anon_vma_chain_link(dst, avc, pavc->anon_vma); + struct anon_vma *anon_vma; + + avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); + if (unlikely(!avc)) { + unlock_anon_vma_root(root); + root = NULL; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto enomem_failure; + } + anon_vma = pavc->anon_vma; + root = lock_anon_vma_root(root, anon_vma); + anon_vma_chain_link(dst, avc, anon_vma); } + unlock_anon_vma_root(root); return 0; enomem_failure: @@ -231,7 +296,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; - avc = anon_vma_chain_alloc(); + avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; @@ -241,58 +306,63 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) */ anon_vma->root = pvma->anon_vma->root; /* - * With KSM refcounts, an anon_vma can stay around longer than the - * process it belongs to. The root anon_vma needs to be pinned - * until this anon_vma is freed, because the lock lives in the root. + * With refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned until + * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_lock(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_unlock(anon_vma); return 0; out_error_free_anon_vma: - anon_vma_free(anon_vma); + put_anon_vma(anon_vma); out_error: unlink_anon_vmas(vma); return -ENOMEM; } -static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) +void unlink_anon_vmas(struct vm_area_struct *vma) { - struct anon_vma *anon_vma = anon_vma_chain->anon_vma; - int empty; + struct anon_vma_chain *avc, *next; + struct anon_vma *root = NULL; - /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ - if (!anon_vma) - return; + /* + * Unlink each anon_vma chained to the VMA. This list is ordered + * from newest to oldest, ensuring the root anon_vma gets freed last. + */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; - anon_vma_lock(anon_vma); - list_del(&anon_vma_chain->same_anon_vma); + root = lock_anon_vma_root(root, anon_vma); + list_del(&avc->same_anon_vma); - /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); - anon_vma_unlock(anon_vma); + /* + * Leave empty anon_vmas on the list - we'll need + * to free them outside the lock. + */ + if (list_empty(&anon_vma->head)) + continue; - if (empty) { - /* We no longer need the root anon_vma */ - if (anon_vma->root != anon_vma) - drop_anon_vma(anon_vma->root); - anon_vma_free(anon_vma); + list_del(&avc->same_vma); + anon_vma_chain_free(avc); } -} - -void unlink_anon_vmas(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc, *next; + unlock_anon_vma_root(root); /* - * Unlink each anon_vma chained to the VMA. This list is ordered - * from newest to oldest, ensuring the root anon_vma gets freed last. + * Iterate the list once more, it now only contains empty and unlinked + * anon_vmas, destroy them. Could not do before due to __put_anon_vma() + * needing to acquire the anon_vma->root->mutex. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { - anon_vma_unlink(avc); + struct anon_vma *anon_vma = avc->anon_vma; + + put_anon_vma(anon_vma); + list_del(&avc->same_vma); anon_vma_chain_free(avc); } @@ -302,8 +372,8 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - spin_lock_init(&anon_vma->lock); - anonvma_external_refcount_init(anon_vma); + mutex_init(&anon_vma->mutex); + atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -315,12 +385,31 @@ void __init anon_vma_init(void) } /* - * Getting a lock on a stable anon_vma from a page off the LRU is - * tricky: page_lock_anon_vma rely on RCU to guard against the races. + * Getting a lock on a stable anon_vma from a page off the LRU is tricky! + * + * Since there is no serialization what so ever against page_remove_rmap() + * the best this function can do is return a locked anon_vma that might + * have been relevant to this page. + * + * The page might have been remapped to a different anon_vma or the anon_vma + * returned may already be freed (and even reused). + * + * In case it was remapped to a different anon_vma, the new anon_vma will be a + * child of the old anon_vma, and the anon_vma lifetime rules will therefore + * ensure that any anon_vma obtained from the page will still be valid for as + * long as we observe page_mapped() [ hence all those page_mapped() tests ]. + * + * All users of this function must be very careful when walking the anon_vma + * chain and verify that the page in question is indeed mapped in it + * [ something equivalent to page_mapped_in_vma() ]. + * + * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() + * that the anon_vma pointer from page->mapping is valid if there is a + * mapcount, we can dereference the anon_vma after observing those. */ -struct anon_vma *__page_lock_anon_vma(struct page *page) +struct anon_vma *page_get_anon_vma(struct page *page) { - struct anon_vma *anon_vma, *root_anon_vma; + struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); @@ -331,32 +420,100 @@ struct anon_vma *__page_lock_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - root_anon_vma = ACCESS_ONCE(anon_vma->root); - spin_lock(&root_anon_vma->lock); + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } /* * If this page is still mapped, then its anon_vma cannot have been - * freed. But if it has been unmapped, we have no security against - * the anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot - * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting - * anon_vma->root before page_unlock_anon_vma() is called to unlock. + * freed. But if it has been unmapped, we have no security against the + * anon_vma structure being freed and reused (for another anon_vma: + * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * above cannot corrupt). */ - if (page_mapped(page)) - return anon_vma; + if (!page_mapped(page)) { + put_anon_vma(anon_vma); + anon_vma = NULL; + } +out: + rcu_read_unlock(); + + return anon_vma; +} + +/* + * Similar to page_get_anon_vma() except it locks the anon_vma. + * + * Its a little more complex as it tries to keep the fast path to a single + * atomic op -- the trylock. If we fail the trylock, we fall back to getting a + * reference like with page_get_anon_vma() and then block on the mutex. + */ +struct anon_vma *page_lock_anon_vma(struct page *page) +{ + struct anon_vma *anon_vma = NULL; + struct anon_vma *root_anon_vma; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + root_anon_vma = ACCESS_ONCE(anon_vma->root); + if (mutex_trylock(&root_anon_vma->mutex)) { + /* + * If the page is still mapped, then this anon_vma is still + * its anon_vma, and holding the mutex ensures that it will + * not go away, see anon_vma_free(). + */ + if (!page_mapped(page)) { + mutex_unlock(&root_anon_vma->mutex); + anon_vma = NULL; + } + goto out; + } + + /* trylock failed, we got to sleep */ + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } + + if (!page_mapped(page)) { + put_anon_vma(anon_vma); + anon_vma = NULL; + goto out; + } + + /* we pinned the anon_vma, its safe to sleep */ + rcu_read_unlock(); + anon_vma_lock(anon_vma); + + if (atomic_dec_and_test(&anon_vma->refcount)) { + /* + * Oops, we held the last refcount, release the lock + * and bail -- can't simply use put_anon_vma() because + * we'll deadlock on the anon_vma_lock() recursion. + */ + anon_vma_unlock(anon_vma); + __put_anon_vma(anon_vma); + anon_vma = NULL; + } + + return anon_vma; - spin_unlock(&root_anon_vma->lock); out: rcu_read_unlock(); - return NULL; + return anon_vma; } void page_unlock_anon_vma(struct anon_vma *anon_vma) - __releases(&anon_vma->root->lock) - __releases(RCU) { anon_vma_unlock(anon_vma); - rcu_read_unlock(); } /* @@ -497,41 +654,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int referenced = 0; - /* - * Don't want to elevate referenced for mlocked page that gets this far, - * in order that it progresses to try_to_unmap and is moved to the - * unevictable list. - */ - if (vma->vm_flags & VM_LOCKED) { - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; - } - - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. */ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; - if (unlikely(PageTransHuge(page))) { pmd_t *pmd; spin_lock(&mm->page_table_lock); + /* + * rmap might return false positives; we must filter + * these out using page_check_address_pmd(). + */ pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_FLAG); - if (pmd && !pmd_trans_splitting(*pmd) && - pmdp_clear_flush_young_notify(vma, address, pmd)) + if (!pmd) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (vma->vm_flags & VM_LOCKED) { + spin_unlock(&mm->page_table_lock); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + + /* go ahead even if the pmd is pmd_trans_splitting() */ + if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; spin_unlock(&mm->page_table_lock); } else { pte_t *pte; spinlock_t *ptl; + /* + * rmap might return false positives; we must filter + * these out using page_check_address(). + */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; + if (vma->vm_flags & VM_LOCKED) { + pte_unmap_unlock(pte, ptl); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { /* * Don't treat a reference through a sequentially read @@ -546,6 +713,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } + /* Pretend the page is referenced if the task has the + swap token and is in the middle of a page fault. */ + if (mm != current->mm && has_swap_token(mm) && + rwsem_is_locked(&mm->mmap_sem)) + referenced++; + (*mapcount)--; if (referenced) @@ -625,14 +798,14 @@ static int page_referenced_file(struct page *page, * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_lock. + * so we can safely take mapping->i_mmap_mutex. */ BUG_ON(!PageLocked(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); /* - * i_mmap_lock does not stabilize mapcount at all, but mapcount + * i_mmap_mutex does not stabilize mapcount at all, but mapcount * is more likely to be accurate if we note it after spinning. */ mapcount = page_mapcount(page); @@ -654,7 +827,7 @@ static int page_referenced_file(struct page *page, break; } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return referenced; } @@ -696,11 +869,11 @@ int page_referenced(struct page *page, vm_flags); if (we_locked) unlock_page(page); + + if (page_test_and_clear_young(page_to_pfn(page))) + referenced++; } out: - if (page_test_and_clear_young(page)) - referenced++; - return referenced; } @@ -741,7 +914,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) BUG_ON(PageAnon(page)); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); @@ -750,7 +923,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) ret += page_mkclean_one(page, vma, address); } } - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -764,10 +937,8 @@ int page_mkclean(struct page *page) struct address_space *mapping = page_mapping(page); if (mapping) { ret = page_mkclean_file(mapping, page); - if (page_test_dirty(page)) { - page_clear_dirty(page, 1); + if (page_test_and_clear_dirty(page_to_pfn(page), 1)) ret = 1; - } } } @@ -893,7 +1064,7 @@ void do_page_add_anon_rmap(struct page *page, return; VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, exclusive); else @@ -960,10 +1131,9 @@ void page_remove_rmap(struct page *page) * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. */ - if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { - page_clear_dirty(page, 1); + if ((!PageAnon(page) || PageSwapCache(page)) && + page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); - } /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. @@ -1101,7 +1271,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->lock or mapping->i_mmap_lock. + * we now hold anon_vma->mutex or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1327,7 +1497,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_size = 0; unsigned int mapcount; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1373,7 +1543,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mapcount = page_mapcount(page); if (!mapcount) goto out; - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; if (max_nl_cursor == 0) @@ -1395,7 +1565,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) } vma->vm_private_data = (void *) max_nl_cursor; } - cond_resched_lock(&mapping->i_mmap_lock); + cond_resched(); max_nl_cursor += CLUSTER_SIZE; } while (max_nl_cursor <= max_nl_size); @@ -1407,7 +1577,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_private_data = NULL; out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1470,41 +1640,15 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } -#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) -/* - * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root - * if necessary. Be careful to do all the tests under the lock. Once - * we know we are the last user, nobody else can get a reference and we - * can do the freeing without the lock. - */ -void drop_anon_vma(struct anon_vma *anon_vma) +void __put_anon_vma(struct anon_vma *anon_vma) { - BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { - struct anon_vma *root = anon_vma->root; - int empty = list_empty(&anon_vma->head); - int last_root_user = 0; - int root_empty = 0; + struct anon_vma *root = anon_vma->root; - /* - * The refcount on a non-root anon_vma got dropped. Drop - * the refcount on the root and check if we need to free it. - */ - if (empty && anon_vma != root) { - BUG_ON(atomic_read(&root->external_refcount) <= 0); - last_root_user = atomic_dec_and_test(&root->external_refcount); - root_empty = list_empty(&root->head); - } - anon_vma_unlock(anon_vma); + if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + anon_vma_free(root); - if (empty) { - anon_vma_free(anon_vma); - if (root_empty && last_root_user) - anon_vma_free(root); - } - } + anon_vma_free(anon_vma); } -#endif #ifdef CONFIG_MIGRATION /* @@ -1552,7 +1696,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, if (!mapping) return ret; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1566,7 +1710,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, * never contain migration ptes. Decide what to do about this * limitation to linear when we need rmap_walk() on nonlinear. */ - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); return ret; } @@ -1615,7 +1759,7 @@ void hugepage_add_anon_rmap(struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* address might be in next vma when migration races vma_adjust */ first = atomic_inc_and_test(&page->_mapcount); if (first) __hugepage_set_anon_rmap(page, vma, address, 0); diff --git a/mm/shmem.c b/mm/shmem.c index 5ee67c99060..32f6763f16f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -6,7 +6,8 @@ * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. - * Copyright (C) 2002-2005 Hugh Dickins. + * Copyright (C) 2002-2011 Hugh Dickins. + * Copyright (C) 2011 Google Inc. * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * @@ -28,7 +29,6 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/module.h> -#include <linux/percpu_counter.h> #include <linux/swap.h> static struct vfsmount *shm_mnt; @@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt; #include <linux/shmem_fs.h> #include <linux/writeback.h> #include <linux/blkdev.h> +#include <linux/pagevec.h> +#include <linux/percpu_counter.h> +#include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> #include <linux/mempolicy.h> @@ -62,44 +65,25 @@ static struct vfsmount *shm_mnt; #include <linux/magic.h> #include <asm/uaccess.h> -#include <asm/div64.h> #include <asm/pgtable.h> -/* - * The maximum size of a shmem/tmpfs file is limited by the maximum size of - * its triple-indirect swap vector - see illustration at shmem_swp_entry(). - * - * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, - * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum - * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, - * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. - * - * We use / and * instead of shifts in the definitions below, so that the swap - * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. - */ -#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) -#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) - -#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) -#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) - -#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) -#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) - #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) -/* info->flags needs VM_flags to handle pagein/truncate races efficiently */ -#define SHMEM_PAGEIN VM_READ -#define SHMEM_TRUNCATE VM_WRITE - -/* Definition to limit shmem_truncate's steps between cond_rescheds */ -#define LATENCY_LIMIT 64 - /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 -/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ +#define SHORT_SYMLINK_LEN 128 + +struct shmem_xattr { + struct list_head list; /* anchored by shmem_inode_info->xattr_list */ + char *name; /* xattr name */ + size_t size; + char value[0]; +}; + +/* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ @@ -119,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type); - -static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) -{ - /* - * The above definition of ENTRIES_PER_PAGE, and the use of - * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: - * might be reconsidered if it ever diverges from PAGE_SIZE. - * - * Mobility flags are masked out as swap vectors cannot move - */ - return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, - PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static inline void shmem_dir_free(struct page *page) -{ - __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static struct page **shmem_dir_map(struct page *page) -{ - return (struct page **)kmap_atomic(page, KM_USER0); -} - -static inline void shmem_dir_unmap(struct page **dir) -{ - kunmap_atomic(dir, KM_USER0); -} +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); -static swp_entry_t *shmem_swp_map(struct page *page) +static inline int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, int *fault_type) { - return (swp_entry_t *)kmap_atomic(page, KM_USER1); -} - -static inline void shmem_swp_balance_unmap(void) -{ - /* - * When passing a pointer to an i_direct entry, to code which - * also handles indirect entries and so will shmem_swp_unmap, - * we must arrange for the preempt count to remain in balance. - * What kmap_atomic of a lowmem page does depends on config - * and architecture, so pretend to kmap_atomic some lowmem page. - */ - (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); -} - -static inline void shmem_swp_unmap(swp_entry_t *entry) -{ - kunmap_atomic(entry, KM_USER1); + return shmem_getpage_gfp(inode, index, pagep, sgp, + mapping_gfp_mask(inode->i_mapping), fault_type); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -224,23 +165,11 @@ static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = default_unplug_io_fn, }; static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); -static void shmem_free_blocks(struct inode *inode, long pages) -{ - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) { - percpu_counter_add(&sbinfo->used_blocks, -pages); - spin_lock(&inode->i_lock); - inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); - } -} - static int shmem_reserve_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); @@ -267,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb) } /** - * shmem_recalc_inode - recalculate the size of an inode + * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * * We have to calculate the free blocks since the mm can drop @@ -285,530 +214,324 @@ static void shmem_recalc_inode(struct inode *inode) freed = info->alloced - info->swapped - inode->i_mapping->nrpages; if (freed > 0) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -freed); info->alloced -= freed; + inode->i_blocks -= freed * BLOCKS_PER_PAGE; shmem_unacct_blocks(info->flags, freed); - shmem_free_blocks(inode, freed); } } -/** - * shmem_swp_entry - find the swap vector position in the info structure - * @info: info structure for the inode - * @index: index of the page to find - * @page: optional page to add to the structure. Has to be preset to - * all zeros - * - * If there is no space allocated yet it will return NULL when - * page is NULL, else it will use the page for the needed block, - * setting it to NULL on return to indicate that it has been used. - * - * The swap vector is organized the following way: - * - * There are SHMEM_NR_DIRECT entries directly stored in the - * shmem_inode_info structure. So small files do not need an addional - * allocation. - * - * For pages with index > SHMEM_NR_DIRECT there is the pointer - * i_indirect which points to a page which holds in the first half - * doubly indirect blocks, in the second half triple indirect blocks: - * - * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the - * following layout (for SHMEM_NR_DIRECT == 16): - * - * i_indirect -> dir --> 16-19 - * | +-> 20-23 - * | - * +-->dir2 --> 24-27 - * | +-> 28-31 - * | +-> 32-35 - * | +-> 36-39 - * | - * +-->dir3 --> 40-43 - * +-> 44-47 - * +-> 48-51 - * +-> 52-55 +/* + * Replace item expected in radix tree by a new item, while holding tree lock. */ -static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) -{ - unsigned long offset; - struct page **dir; - struct page *subdir; +static int shmem_radix_tree_replace(struct address_space *mapping, + pgoff_t index, void *expected, void *replacement) +{ + void **pslot; + void *item = NULL; + + VM_BUG_ON(!expected); + pslot = radix_tree_lookup_slot(&mapping->page_tree, index); + if (pslot) + item = radix_tree_deref_slot_protected(pslot, + &mapping->tree_lock); + if (item != expected) + return -ENOENT; + if (replacement) + radix_tree_replace_slot(pslot, replacement); + else + radix_tree_delete(&mapping->page_tree, index); + return 0; +} - if (index < SHMEM_NR_DIRECT) { - shmem_swp_balance_unmap(); - return info->i_direct+index; - } - if (!info->i_indirect) { - if (page) { - info->i_indirect = *page; - *page = NULL; - } - return NULL; /* need another page */ - } +/* + * Like add_to_page_cache_locked, but error if expected item has gone. + */ +static int shmem_add_to_page_cache(struct page *page, + struct address_space *mapping, + pgoff_t index, gfp_t gfp, void *expected) +{ + int error = 0; - index -= SHMEM_NR_DIRECT; - offset = index % ENTRIES_PER_PAGE; - index /= ENTRIES_PER_PAGE; - dir = shmem_dir_map(info->i_indirect); - - if (index >= ENTRIES_PER_PAGE/2) { - index -= ENTRIES_PER_PAGE/2; - dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; - index %= ENTRIES_PER_PAGE; - subdir = *dir; - if (!subdir) { - if (page) { - *dir = *page; - *page = NULL; - } - shmem_dir_unmap(dir); - return NULL; /* need another page */ - } - shmem_dir_unmap(dir); - dir = shmem_dir_map(subdir); - } + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageSwapBacked(page)); - dir += index; - subdir = *dir; - if (!subdir) { - if (!page || !(subdir = *page)) { - shmem_dir_unmap(dir); - return NULL; /* need a page */ + if (!expected) + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + page_cache_get(page); + page->mapping = mapping; + page->index = index; + + spin_lock_irq(&mapping->tree_lock); + if (!expected) + error = radix_tree_insert(&mapping->page_tree, + index, page); + else + error = shmem_radix_tree_replace(mapping, index, + expected, page); + if (!error) { + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); } - *dir = subdir; - *page = NULL; + if (!expected) + radix_tree_preload_end(); } - shmem_dir_unmap(dir); - return shmem_swp_map(subdir) + offset; + if (error) + mem_cgroup_uncharge_cache_page(page); + return error; } -static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) +/* + * Like delete_from_page_cache, but substitutes swap for page. + */ +static void shmem_delete_from_page_cache(struct page *page, void *radswap) { - long incdec = value? 1: -1; + struct address_space *mapping = page->mapping; + int error; - entry->val = value; - info->swapped += incdec; - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { - struct page *page = kmap_atomic_to_page(entry); - set_page_private(page, page_private(page) + incdec); - } + spin_lock_irq(&mapping->tree_lock); + error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + page->mapping = NULL; + mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + __dec_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); + BUG_ON(error); } -/** - * shmem_swp_alloc - get the position of the swap entry for the page. - * @info: info structure for the inode - * @index: index of the page to find - * @sgp: check and recheck i_size? skip allocation? - * - * If the entry does not exist, allocate it. +/* + * Like find_get_pages, but collecting swap entries as well as pages. */ -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) -{ - struct inode *inode = &info->vfs_inode; - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - struct page *page = NULL; - swp_entry_t *entry; - - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return ERR_PTR(-EINVAL); - - while (!(entry = shmem_swp_entry(info, index, &page))) { - if (sgp == SGP_READ) - return shmem_swp_map(ZERO_PAGE(0)); - /* - * Test used_blocks against 1 less max_blocks, since we have 1 data - * page (and perhaps indirect index pages) yet to allocate: - * a waste to allocate index if we cannot allocate data. - */ - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) - return ERR_PTR(-ENOSPC); - percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); - inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); +static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, + pgoff_t start, unsigned int nr_pages, + struct page **pages, pgoff_t *indices) +{ + unsigned int i; + unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, indices, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * Otherwise, we must be storing a swap entry + * here as an exceptional entry: so return it + * without attempting to raise page count. + */ + goto export; } + if (!page_cache_get_speculative(page)) + goto repeat; - spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); - spin_lock(&info->lock); - - if (!page) { - shmem_free_blocks(inode, 1); - return ERR_PTR(-ENOMEM); - } - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { - entry = ERR_PTR(-EINVAL); - break; + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; } - if (info->next_index <= index) - info->next_index = index + 1; +export: + indices[ret] = indices[i]; + pages[ret] = page; + ret++; } - if (page) { - /* another task gave its page, or truncated the file */ - shmem_free_blocks(inode, 1); - shmem_dir_free(page); - } - if (info->next_index <= index && !IS_ERR(entry)) - info->next_index = index + 1; - return entry; + if (unlikely(!ret && nr_found)) + goto restart; + rcu_read_unlock(); + return ret; } -/** - * shmem_free_swp - free some swap entries in a directory - * @dir: pointer to the directory - * @edir: pointer after last entry of the directory - * @punch_lock: pointer to spinlock when needed for the holepunch case +/* + * Remove swap entry from radix tree, free the swap and its page cache. */ -static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, - spinlock_t *punch_lock) -{ - spinlock_t *punch_unlock = NULL; - swp_entry_t *ptr; - int freed = 0; - - for (ptr = dir; ptr < edir; ptr++) { - if (ptr->val) { - if (unlikely(punch_lock)) { - punch_unlock = punch_lock; - punch_lock = NULL; - spin_lock(punch_unlock); - if (!ptr->val) - continue; - } - free_swap_and_cache(*ptr); - *ptr = (swp_entry_t){0}; - freed++; - } - } - if (punch_unlock) - spin_unlock(punch_unlock); - return freed; -} - -static int shmem_map_and_free_swp(struct page *subdir, int offset, - int limit, struct page ***dir, spinlock_t *punch_lock) -{ - swp_entry_t *ptr; - int freed = 0; - - ptr = shmem_swp_map(subdir); - for (; offset < limit; offset += LATENCY_LIMIT) { - int size = limit - offset; - if (size > LATENCY_LIMIT) - size = LATENCY_LIMIT; - freed += shmem_free_swp(ptr+offset, ptr+offset+size, - punch_lock); - if (need_resched()) { - shmem_swp_unmap(ptr); - if (*dir) { - shmem_dir_unmap(*dir); - *dir = NULL; - } - cond_resched(); - ptr = shmem_swp_map(subdir); - } - } - shmem_swp_unmap(ptr); - return freed; +static int shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) +{ + int error; + + spin_lock_irq(&mapping->tree_lock); + error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + spin_unlock_irq(&mapping->tree_lock); + if (!error) + free_swap_and_cache(radix_to_swp_entry(radswap)); + return error; } -static void shmem_free_pages(struct list_head *next) +/* + * Pagevec may contain swap entries, so shuffle up pages before releasing. + */ +static void shmem_pagevec_release(struct pagevec *pvec) { - struct page *page; - int freed = 0; - - do { - page = container_of(next, struct page, lru); - next = next->next; - shmem_dir_free(page); - freed++; - if (freed >= LATENCY_LIMIT) { - cond_resched(); - freed = 0; - } - } while (next); + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; + pagevec_release(pvec); } -static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +/* + * Remove range of pages and swap entries from radix tree, and free them. + */ +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { + struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long idx; - unsigned long size; - unsigned long limit; - unsigned long stage; - unsigned long diroff; - struct page **dir; - struct page *topdir; - struct page *middir; - struct page *subdir; - swp_entry_t *ptr; - LIST_HEAD(pages_to_free); - long nr_pages_to_free = 0; + pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; - int offset; - int freed; - int punch_hole; - spinlock_t *needs_lock; - spinlock_t *punch_lock; - unsigned long upper_limit; + pgoff_t index; + int i; + + BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + + pagevec_init(&pvec, 0); + index = start; + while (index <= end) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + pvec.pages, indices); + if (!pvec.nr) + break; + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (idx >= info->next_index) - return; + index = indices[i]; + if (index > end) + break; - spin_lock(&info->lock); - info->flags |= SHMEM_TRUNCATE; - if (likely(end == (loff_t) -1)) { - limit = info->next_index; - upper_limit = SHMEM_MAX_INDEX; - info->next_index = idx; - needs_lock = NULL; - punch_hole = 0; - } else { - if (end + 1 >= inode->i_size) { /* we may free a little more */ - limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - upper_limit = SHMEM_MAX_INDEX; - } else { - limit = (end + 1) >> PAGE_CACHE_SHIFT; - upper_limit = limit; - } - needs_lock = &info->lock; - punch_hole = 1; - } + if (radix_tree_exceptional_entry(page)) { + nr_swaps_freed += !shmem_free_swap(mapping, + index, page); + continue; + } - topdir = info->i_indirect; - if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { - info->i_indirect = NULL; - nr_pages_to_free++; - list_add(&topdir->lru, &pages_to_free); + if (!trylock_page(page)) + continue; + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } + unlock_page(page); + } + shmem_pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + cond_resched(); + index++; } - spin_unlock(&info->lock); - if (info->swapped && idx < SHMEM_NR_DIRECT) { - ptr = info->i_direct; - size = limit; - if (size > SHMEM_NR_DIRECT) - size = SHMEM_NR_DIRECT; - nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); + if (partial) { + struct page *page = NULL; + shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, partial, PAGE_CACHE_SIZE); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } } - /* - * If there are no indirect blocks or we are punching a hole - * below indirect blocks, nothing to be done. - */ - if (!topdir || limit <= SHMEM_NR_DIRECT) - goto done2; + index = start; + for ( ; ; ) { + cond_resched(); + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + pvec.pages, indices); + if (!pvec.nr) { + if (index == start) + break; + index = start; + continue; + } + if (index == start && indices[0] > end) { + shmem_pagevec_release(&pvec); + break; + } + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - /* - * The truncation case has already dropped info->lock, and we're safe - * because i_size and next_index have already been lowered, preventing - * access beyond. But in the punch_hole case, we still need to take - * the lock when updating the swap directory, because there might be - * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or - * shmem_writepage. However, whenever we find we can remove a whole - * directory page (not at the misaligned start or end of the range), - * we first NULLify its pointer in the level above, and then have no - * need to take the lock when updating its contents: needs_lock and - * punch_lock (either pointing to info->lock or NULL) manage this. - */ + index = indices[i]; + if (index > end) + break; - upper_limit -= SHMEM_NR_DIRECT; - limit -= SHMEM_NR_DIRECT; - idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; - offset = idx % ENTRIES_PER_PAGE; - idx -= offset; - - dir = shmem_dir_map(topdir); - stage = ENTRIES_PER_PAGEPAGE/2; - if (idx < ENTRIES_PER_PAGEPAGE/2) { - middir = topdir; - diroff = idx/ENTRIES_PER_PAGE; - } else { - dir += ENTRIES_PER_PAGE/2; - dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; - while (stage <= idx) - stage += ENTRIES_PER_PAGEPAGE; - middir = *dir; - if (*dir) { - diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % - ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; - if (!diroff && !offset && upper_limit >= stage) { - if (needs_lock) { - spin_lock(needs_lock); - *dir = NULL; - spin_unlock(needs_lock); - needs_lock = NULL; - } else - *dir = NULL; - nr_pages_to_free++; - list_add(&middir->lru, &pages_to_free); + if (radix_tree_exceptional_entry(page)) { + nr_swaps_freed += !shmem_free_swap(mapping, + index, page); + continue; } - shmem_dir_unmap(dir); - dir = shmem_dir_map(middir); - } else { - diroff = 0; - offset = 0; - idx = stage; - } - } - for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { - if (unlikely(idx == stage)) { - shmem_dir_unmap(dir); - dir = shmem_dir_map(topdir) + - ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; - while (!*dir) { - dir++; - idx += ENTRIES_PER_PAGEPAGE; - if (idx >= limit) - goto done1; - } - stage = idx + ENTRIES_PER_PAGEPAGE; - middir = *dir; - if (punch_hole) - needs_lock = &info->lock; - if (upper_limit >= stage) { - if (needs_lock) { - spin_lock(needs_lock); - *dir = NULL; - spin_unlock(needs_lock); - needs_lock = NULL; - } else - *dir = NULL; - nr_pages_to_free++; - list_add(&middir->lru, &pages_to_free); + lock_page(page); + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); } - shmem_dir_unmap(dir); - cond_resched(); - dir = shmem_dir_map(middir); - diroff = 0; - } - punch_lock = needs_lock; - subdir = dir[diroff]; - if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { - if (needs_lock) { - spin_lock(needs_lock); - dir[diroff] = NULL; - spin_unlock(needs_lock); - punch_lock = NULL; - } else - dir[diroff] = NULL; - nr_pages_to_free++; - list_add(&subdir->lru, &pages_to_free); - } - if (subdir && page_private(subdir) /* has swap entries */) { - size = limit - idx; - if (size > ENTRIES_PER_PAGE) - size = ENTRIES_PER_PAGE; - freed = shmem_map_and_free_swp(subdir, - offset, size, &dir, punch_lock); - if (!dir) - dir = shmem_dir_map(middir); - nr_swaps_freed += freed; - if (offset || punch_lock) { - spin_lock(&info->lock); - set_page_private(subdir, - page_private(subdir) - freed); - spin_unlock(&info->lock); - } else - BUG_ON(page_private(subdir) != freed); + unlock_page(page); } - offset = 0; - } -done1: - shmem_dir_unmap(dir); -done2: - if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { - /* - * Call truncate_inode_pages again: racing shmem_unuse_inode - * may have swizzled a page in from swap since - * truncate_pagecache or generic_delete_inode did it, before we - * lowered next_index. Also, though shmem_getpage checks - * i_size before adding to cache, no recheck after: so fix the - * narrow window there too. - * - * Recalling truncate_inode_pages_range and unmap_mapping_range - * every time for punch_hole (which never got a chance to clear - * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive, - * yet hardly ever necessary: try to optimize them out later. - */ - truncate_inode_pages_range(inode->i_mapping, start, end); - if (punch_hole) - unmap_mapping_range(inode->i_mapping, start, - end - start, 1); + shmem_pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + index++; } spin_lock(&info->lock); - info->flags &= ~SHMEM_TRUNCATE; info->swapped -= nr_swaps_freed; - if (nr_pages_to_free) - shmem_free_blocks(inode, nr_pages_to_free); shmem_recalc_inode(inode); spin_unlock(&info->lock); - /* - * Empty swap vector directory pages to be freed? - */ - if (!list_empty(&pages_to_free)) { - pages_to_free.prev->next = NULL; - shmem_free_pages(pages_to_free.next); - } + inode->i_ctime = inode->i_mtime = CURRENT_TIME; } +EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) +static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - loff_t newsize = attr->ia_size; int error; error = inode_change_ok(inode, attr); if (error) return error; - if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) - && newsize != inode->i_size) { - struct page *page = NULL; + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; - if (newsize < inode->i_size) { - /* - * If truncating down to a partial page, then - * if that page is already allocated, hold it - * in memory until the truncation is over, so - * truncate_partial_page cannnot miss it were - * it assigned to swap. - */ - if (newsize & (PAGE_CACHE_SIZE-1)) { - (void) shmem_getpage(inode, - newsize >> PAGE_CACHE_SHIFT, - &page, SGP_READ, NULL); - if (page) - unlock_page(page); - } - /* - * Reset SHMEM_PAGEIN flag so that shmem_truncate can - * detect if any pages might have been added to cache - * after truncate_inode_pages. But we needn't bother - * if it's being fully truncated to zero-length: the - * nrpages check is efficient enough in that case. - */ - if (newsize) { - struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); - info->flags &= ~SHMEM_PAGEIN; - spin_unlock(&info->lock); - } + if (newsize != oldsize) { + i_size_write(inode, newsize); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + } + if (newsize < oldsize) { + loff_t holebegin = round_up(newsize, PAGE_SIZE); + unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + shmem_truncate_range(inode, newsize, (loff_t)-1); + /* unmap again to remove racily COWed private pages */ + unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } - - /* XXX(truncate): truncate_setsize should be called last */ - truncate_setsize(inode, newsize); - if (page) - page_cache_release(page); - shmem_truncate_range(inode, newsize, (loff_t)-1); } setattr_copy(inode, attr); @@ -822,9 +545,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_xattr *xattr, *nxattr; if (inode->i_mapping->a_ops == &shmem_aops) { - truncate_inode_pages(inode->i_mapping, 0); shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; shmem_truncate_range(inode, 0, (loff_t)-1); @@ -833,200 +556,111 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } + } else + kfree(info->symlink); + + list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { + kfree(xattr->name); + kfree(xattr); } BUG_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); end_writeback(inode); } -static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) -{ - swp_entry_t *ptr; - - for (ptr = dir; ptr < edir; ptr++) { - if (ptr->val == entry.val) - return ptr - dir; - } - return -1; -} - -static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +/* + * If swap found in inode, free it and move page from swapcache to filecache. + */ +static int shmem_unuse_inode(struct shmem_inode_info *info, + swp_entry_t swap, struct page *page) { - struct inode *inode; - unsigned long idx; - unsigned long size; - unsigned long limit; - unsigned long stage; - struct page **dir; - struct page *subdir; - swp_entry_t *ptr; - int offset; + struct address_space *mapping = info->vfs_inode.i_mapping; + void *radswap; + pgoff_t index; int error; - idx = 0; - ptr = info->i_direct; - spin_lock(&info->lock); - if (!info->swapped) { - list_del_init(&info->swaplist); - goto lost2; - } - limit = info->next_index; - size = limit; - if (size > SHMEM_NR_DIRECT) - size = SHMEM_NR_DIRECT; - offset = shmem_find_swp(entry, ptr, ptr+size); - if (offset >= 0) - goto found; - if (!info->i_indirect) - goto lost2; - - dir = shmem_dir_map(info->i_indirect); - stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; - - for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { - if (unlikely(idx == stage)) { - shmem_dir_unmap(dir-1); - if (cond_resched_lock(&info->lock)) { - /* check it has not been truncated */ - if (limit > info->next_index) { - limit = info->next_index; - if (idx >= limit) - goto lost2; - } - } - dir = shmem_dir_map(info->i_indirect) + - ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; - while (!*dir) { - dir++; - idx += ENTRIES_PER_PAGEPAGE; - if (idx >= limit) - goto lost1; - } - stage = idx + ENTRIES_PER_PAGEPAGE; - subdir = *dir; - shmem_dir_unmap(dir); - dir = shmem_dir_map(subdir); - } - subdir = *dir; - if (subdir && page_private(subdir)) { - ptr = shmem_swp_map(subdir); - size = limit - idx; - if (size > ENTRIES_PER_PAGE) - size = ENTRIES_PER_PAGE; - offset = shmem_find_swp(entry, ptr, ptr+size); - shmem_swp_unmap(ptr); - if (offset >= 0) { - shmem_dir_unmap(dir); - goto found; - } - } - } -lost1: - shmem_dir_unmap(dir-1); -lost2: - spin_unlock(&info->lock); - return 0; -found: - idx += offset; - inode = igrab(&info->vfs_inode); - spin_unlock(&info->lock); + radswap = swp_to_radix_entry(swap); + index = radix_tree_locate_item(&mapping->page_tree, radswap); + if (index == -1) + return 0; /* * Move _head_ to start search for next from here. * But be careful: shmem_evict_inode checks list_empty without taking * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. We - * could avoid doing it if inode NULL; or use this minor optimization. + * would appear empty, if it were the only one on shmem_swaplist. */ if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); - error = 1; - if (!inode) - goto out; /* - * Charge page using GFP_KERNEL while we can wait. - * Charged back to the user(not to caller) when swap account is used. - * add_to_page_cache() will be called with GFP_NOWAIT. + * We rely on shmem_swaplist_mutex, not only to protect the swaplist, + * but also to hold up shmem_evict_inode(): so inode cannot be freed + * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); - if (error) - goto out; - error = radix_tree_preload(GFP_KERNEL); - if (error) { - mem_cgroup_uncharge_cache_page(page); - goto out; - } - error = 1; + error = shmem_add_to_page_cache(page, mapping, index, + GFP_NOWAIT, radswap); + /* which does mem_cgroup_uncharge_cache_page on error */ - spin_lock(&info->lock); - ptr = shmem_swp_entry(info, idx, NULL); - if (ptr && ptr->val == entry.val) { - error = add_to_page_cache_locked(page, inode->i_mapping, - idx, GFP_NOWAIT); - /* does mem_cgroup_uncharge_cache_page on error */ - } else /* we must compensate for our precharge above */ - mem_cgroup_uncharge_cache_page(page); - - if (error == -EEXIST) { - struct page *filepage = find_get_page(inode->i_mapping, idx); - error = 1; - if (filepage) { - /* - * There might be a more uptodate page coming down - * from a stacked writepage: forget our swappage if so. - */ - if (PageUptodate(filepage)) - error = 0; - page_cache_release(filepage); - } - } - if (!error) { + if (error != -ENOMEM) { + /* + * Truncation and eviction use free_swap_and_cache(), which + * only does trylock page: if we raced, best clean up here. + */ delete_from_swap_cache(page); set_page_dirty(page); - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, ptr, 0); - swap_free(entry); + if (!error) { + spin_lock(&info->lock); + info->swapped--; + spin_unlock(&info->lock); + swap_free(swap); + } error = 1; /* not an error, but entry was found */ } - if (ptr) - shmem_swp_unmap(ptr); - spin_unlock(&info->lock); - radix_tree_preload_end(); -out: - unlock_page(page); - page_cache_release(page); - iput(inode); /* allows for NULL */ return error; } /* - * shmem_unuse() search for an eventually swapped out shmem page. + * Search through swapped inodes to find and replace swap by page. */ -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { - struct list_head *p, *next; + struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; + int error; + + /* + * Charge page using GFP_KERNEL while we can wait, before taking + * the shmem_swaplist_mutex which might hold up shmem_writepage(). + * Charged back to the user (not to caller) when swap account is used. + */ + error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + if (error) + goto out; + /* No radix_tree_preload: swap entry keeps a place for page in tree */ mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(p, next, &shmem_swaplist) { - info = list_entry(p, struct shmem_inode_info, swaplist); - found = shmem_unuse_inode(info, entry, page); + list_for_each_safe(this, next, &shmem_swaplist) { + info = list_entry(this, struct shmem_inode_info, swaplist); + if (info->swapped) + found = shmem_unuse_inode(info, swap, page); + else + list_del_init(&info->swaplist); cond_resched(); if (found) - goto out; + break; } mutex_unlock(&shmem_swaplist_mutex); - /* - * Can some race bring us here? We've been holding page lock, - * so I think not; but would rather try again later than BUG() - */ + + if (!found) + mem_cgroup_uncharge_cache_page(page); + if (found < 0) + error = found; +out: unlock_page(page); page_cache_release(page); -out: - return (found < 0) ? found : 0; + return error; } /* @@ -1035,10 +669,10 @@ out: static int shmem_writepage(struct page *page, struct writeback_control *wbc) { struct shmem_inode_info *info; - swp_entry_t *entry, swap; struct address_space *mapping; - unsigned long index; struct inode *inode; + swp_entry_t swap; + pgoff_t index; BUG_ON(!PageLocked(page)); mapping = page->mapping; @@ -1053,63 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * shmem_backing_dev_info's capabilities prevent regular writeback or * sync from ever calling shmem_writepage; but a stacking filesystem - * may use the ->writepage of its underlying filesystem, in which case + * might use ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. However, in those cases, - * we do still want to check if there's a redundant swappage to be - * discarded. + * and not for the writeback threads or sync. */ - if (wbc->for_reclaim) - swap = get_swap_page(); - else - swap.val = 0; - - spin_lock(&info->lock); - if (index >= info->next_index) { - BUG_ON(!(info->flags & SHMEM_TRUNCATE)); - goto unlock; - } - entry = shmem_swp_entry(info, index, NULL); - if (entry->val) { - /* - * The more uptodate page coming down from a stacked - * writepage should replace our old swappage. - */ - free_swap_and_cache(*entry); - shmem_swp_set(info, entry, 0); + if (!wbc->for_reclaim) { + WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ + goto redirty; } - shmem_recalc_inode(inode); + swap = get_swap_page(); + if (!swap.val) + goto redirty; - if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - remove_from_page_cache(page); - shmem_swp_set(info, entry, swap.val); - shmem_swp_unmap(entry); - if (list_empty(&info->swaplist)) - inode = igrab(inode); - else - inode = NULL; - spin_unlock(&info->lock); + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the page is + * moved to swap cache, when its pagelock no longer protects + * the inode from eviction. But don't unlock the mutex until + * we've incremented swapped, because shmem_unuse_inode() will + * prune a !swapped inode from the swaplist under this mutex. + */ + mutex_lock(&shmem_swaplist_mutex); + if (list_empty(&info->swaplist)) + list_add_tail(&info->swaplist, &shmem_swaplist); + + if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { swap_shmem_alloc(swap); + shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + + spin_lock(&info->lock); + info->swapped++; + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); - page_cache_release(page); /* pagecache ref */ swap_writepage(page, wbc); - if (inode) { - mutex_lock(&shmem_swaplist_mutex); - /* move instead of add in case we're racing */ - list_move_tail(&info->swaplist, &shmem_swaplist); - mutex_unlock(&shmem_swaplist_mutex); - iput(inode); - } return 0; } - shmem_swp_unmap(entry); -unlock: - spin_unlock(&info->lock); - /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. - */ + mutex_unlock(&shmem_swaplist_mutex); swapcache_free(swap, NULL); redirty: set_page_dirty(page); @@ -1146,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } #endif /* CONFIG_TMPFS */ -static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { struct mempolicy mpol, *spol; struct vm_area_struct pvma; - struct page *page; spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, idx)); + mpol_shared_policy_lookup(&info->policy, index)); /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; pvma.vm_policy = spol; - page = swapin_readahead(entry, gfp, &pvma, 0); - return page; + return swapin_readahead(swap, gfp, &pvma, 0); } static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); /* * alloc_page_vma() will drop the shared policy reference @@ -1183,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } #endif /* CONFIG_TMPFS */ -static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { - return swapin_readahead(entry, gfp, NULL, 0); + return swapin_readahead(swap, gfp, NULL, 0); } static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { return alloc_page(gfp); } @@ -1209,300 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #endif /* - * shmem_getpage - either get the page from swap or allocate a new one + * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type) +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; - struct page *filepage = *pagep; - struct page *swappage; - struct page *prealloc_page = NULL; - swp_entry_t *entry; + struct page *page; swp_entry_t swap; - gfp_t gfp; int error; + int once = 0; - if (idx >= SHMEM_MAX_INDEX) + if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; +repeat: + swap.val = 0; + page = find_lock_page(mapping, index); + if (radix_tree_exceptional_entry(page)) { + swap = radix_to_swp_entry(page); + page = NULL; + } - if (type) - *type = 0; + if (sgp != SGP_WRITE && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + goto failed; + } - /* - * Normally, filepage is NULL on entry, and either found - * uptodate immediately, or allocated and zeroed, or read - * in under swappage, which is then assigned to filepage. - * But shmem_readpage (required for splice) passes in a locked - * filepage, which may be found not uptodate by other callers - * too, and may need to be copied from the swappage read in. - */ -repeat: - if (!filepage) - filepage = find_lock_page(mapping, idx); - if (filepage && PageUptodate(filepage)) - goto done; - gfp = mapping_gfp_mask(mapping); - if (!filepage) { + if (page || (sgp == SGP_READ && !swap.val)) { /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu. + * Once we can get the page lock, it must be uptodate: + * if there were an error in reading back from swap, + * the page would not be inserted into the filecache. */ - error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); - if (error) - goto failed; - radix_tree_preload_end(); - if (sgp != SGP_READ && !prealloc_page) { - /* We don't care if this fails */ - prealloc_page = shmem_alloc_page(gfp, info, idx); - if (prealloc_page) { - if (mem_cgroup_cache_charge(prealloc_page, - current->mm, GFP_KERNEL)) { - page_cache_release(prealloc_page); - prealloc_page = NULL; - } - } - } + BUG_ON(page && !PageUptodate(page)); + *pagep = page; + return 0; } - error = 0; - spin_lock(&info->lock); - shmem_recalc_inode(inode); - entry = shmem_swp_alloc(info, idx, sgp); - if (IS_ERR(entry)) { - spin_unlock(&info->lock); - error = PTR_ERR(entry); - goto failed; - } - swap = *entry; + /* + * Fast cache lookup did not find it: + * bring it back from swap or allocate. + */ + info = SHMEM_I(inode); + sbinfo = SHMEM_SB(inode->i_sb); if (swap.val) { /* Look it up and read it in.. */ - swappage = lookup_swap_cache(swap); - if (!swappage) { - shmem_swp_unmap(entry); + page = lookup_swap_cache(swap); + if (!page) { /* here we actually do the io */ - if (type && !(*type & VM_FAULT_MAJOR)) { - __count_vm_event(PGMAJFAULT); - *type |= VM_FAULT_MAJOR; - } - spin_unlock(&info->lock); - swappage = shmem_swapin(swap, gfp, info, idx); - if (!swappage) { - spin_lock(&info->lock); - entry = shmem_swp_alloc(info, idx, sgp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - if (entry->val == swap.val) - error = -ENOMEM; - shmem_swp_unmap(entry); - } - spin_unlock(&info->lock); - if (error) - goto failed; - goto repeat; + if (fault_type) + *fault_type |= VM_FAULT_MAJOR; + page = shmem_swapin(swap, gfp, info, index); + if (!page) { + error = -ENOMEM; + goto failed; } - wait_on_page_locked(swappage); - page_cache_release(swappage); - goto repeat; } /* We have to do this with page locked to prevent races */ - if (!trylock_page(swappage)) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - wait_on_page_locked(swappage); - page_cache_release(swappage); - goto repeat; - } - if (PageWriteback(swappage)) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - wait_on_page_writeback(swappage); - unlock_page(swappage); - page_cache_release(swappage); - goto repeat; - } - if (!PageUptodate(swappage)) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - unlock_page(swappage); - page_cache_release(swappage); + lock_page(page); + if (!PageUptodate(page)) { error = -EIO; goto failed; } - - if (filepage) { - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - copy_highpage(filepage, swappage); - unlock_page(swappage); - page_cache_release(swappage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); - set_page_dirty(filepage); - swap_free(swap); - } else if (!(error = add_to_page_cache_locked(swappage, mapping, - idx, GFP_NOWAIT))) { - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - filepage = swappage; - set_page_dirty(filepage); - swap_free(swap); - } else { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - if (error == -ENOMEM) { - /* - * reclaim from proper memory cgroup and - * call memcg's OOM if needed. - */ - error = mem_cgroup_shmem_charge_fallback( - swappage, - current->mm, - gfp); - if (error) { - unlock_page(swappage); - page_cache_release(swappage); - goto failed; - } - } - unlock_page(swappage); - page_cache_release(swappage); - goto repeat; - } - } else if (sgp == SGP_READ && !filepage) { - shmem_swp_unmap(entry); - filepage = find_get_page(mapping, idx); - if (filepage && - (!PageUptodate(filepage) || !trylock_page(filepage))) { - spin_unlock(&info->lock); - wait_on_page_locked(filepage); - page_cache_release(filepage); - filepage = NULL; - goto repeat; + wait_on_page_writeback(page); + + /* Someone may have already done it for us */ + if (page->mapping) { + if (page->mapping == mapping && + page->index == index) + goto done; + error = -EEXIST; + goto failed; } + + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, swp_to_radix_entry(swap)); + if (error) + goto failed; + + spin_lock(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); spin_unlock(&info->lock); + + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); + } else { - shmem_swp_unmap(entry); - sbinfo = SHMEM_SB(inode->i_sb); + if (shmem_acct_block(info->flags)) { + error = -ENOSPC; + goto failed; + } if (sbinfo->max_blocks) { - if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || - shmem_acct_block(info->flags)) { - spin_unlock(&info->lock); + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0) { error = -ENOSPC; - goto failed; + goto unacct; } percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); - inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); - } else if (shmem_acct_block(info->flags)) { - spin_unlock(&info->lock); - error = -ENOSPC; - goto failed; } - if (!filepage) { - int ret; - - if (!prealloc_page) { - spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, info, idx); - if (!filepage) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - error = -ENOMEM; - goto failed; - } - SetPageSwapBacked(filepage); - - /* - * Precharge page while we can wait, compensate - * after - */ - error = mem_cgroup_cache_charge(filepage, - current->mm, GFP_KERNEL); - if (error) { - page_cache_release(filepage); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - filepage = NULL; - goto failed; - } - - spin_lock(&info->lock); - } else { - filepage = prealloc_page; - prealloc_page = NULL; - SetPageSwapBacked(filepage); - } - - entry = shmem_swp_alloc(info, idx, sgp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - swap = *entry; - shmem_swp_unmap(entry); - } - ret = error || swap.val; - if (ret) - mem_cgroup_uncharge_cache_page(filepage); - else - ret = add_to_page_cache_lru(filepage, mapping, - idx, GFP_NOWAIT); - /* - * At add_to_page_cache_lru() failure, uncharge will - * be done automatically. - */ - if (ret) { - spin_unlock(&info->lock); - page_cache_release(filepage); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - filepage = NULL; - if (error) - goto failed; - goto repeat; - } - info->flags |= SHMEM_PAGEIN; + page = shmem_alloc_page(gfp, info, index); + if (!page) { + error = -ENOMEM; + goto decused; } + SetPageSwapBacked(page); + __set_page_locked(page); + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + if (error) + goto decused; + lru_cache_add_anon(page); + + spin_lock(&info->lock); info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); spin_unlock(&info->lock); - clear_highpage(filepage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); + + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); if (sgp == SGP_DIRTY) - set_page_dirty(filepage); + set_page_dirty(page); } done: - *pagep = filepage; - error = 0; - goto out; + /* Perhaps the file has been truncated since we checked */ + if (sgp != SGP_WRITE && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + goto trunc; + } + *pagep = page; + return 0; + /* + * Error recovery. + */ +trunc: + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + info->alloced--; + inode->i_blocks -= BLOCKS_PER_PAGE; + spin_unlock(&info->lock); +decused: + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +unacct: + shmem_unacct_blocks(info->flags, 1); failed: - if (*pagep != filepage) { - unlock_page(filepage); - page_cache_release(filepage); + if (swap.val && error != -EINVAL) { + struct page *test = find_get_page(mapping, index); + if (test && !radix_tree_exceptional_entry(test)) + page_cache_release(test); + /* Have another try if the entry has changed */ + if (test != swp_to_radix_entry(swap)) + error = -EEXIST; } -out: - if (prealloc_page) { - mem_cgroup_uncharge_cache_page(prealloc_page); - page_cache_release(prealloc_page); + if (page) { + unlock_page(page); + page_cache_release(page); + } + if (error == -ENOSPC && !once++) { + info = SHMEM_I(inode); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + goto repeat; } + if (error == -EEXIST) + goto repeat; return error; } @@ -1510,33 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; int error; - int ret; - - if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + int ret = VM_FAULT_LOCKED; error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - return ret | VM_FAULT_LOCKED; + if (ret & VM_FAULT_MAJOR) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + } + return ret; } #ifdef CONFIG_NUMA -static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - unsigned long idx; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + pgoff_t index; - idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); + index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } #endif @@ -1597,6 +1108,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); + INIT_LIST_HEAD(&info->xattr_list); cache_no_acl(inode); switch (mode & S_IFMT) { @@ -1633,20 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; -static const struct inode_operations shmem_symlink_inline_operations; - -/* - * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; - * but providing them allows a tmpfs file to be used for splice, sendfile, and - * below the loop driver, in the generic fashion that many filesystems support. - */ -static int shmem_readpage(struct file *file, struct page *page) -{ - struct inode *inode = page->mapping->host; - int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); - unlock_page(page); - return error; -} +static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(struct file *file, struct address_space *mapping, @@ -1655,7 +1154,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = NULL; return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1680,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ { struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; - unsigned long index, offset; + pgoff_t index; + unsigned long offset; enum sgp_type sgp = SGP_READ; /* @@ -1696,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ for (;;) { struct page *page = NULL; - unsigned long end_index, nr, ret; + pgoff_t end_index; + unsigned long nr, ret; loff_t i_size = i_size_read(inode); end_index = i_size >> PAGE_CACHE_SHIFT; @@ -1812,6 +1312,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, return retval; } +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + struct inode *inode = mapping->host; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize, left; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + isize = i_size_read(inode); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, pipe->buffers); + + spd.nr_pages = find_get_pages_contig(mapping, index, + nr_pages, spd.pages); + index += spd.nr_pages; + error = 0; + + while (spd.nr_pages < nr_pages) { + error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); + spd.pages[spd.nr_pages++] = page; + index++; + } + + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (!PageUptodate(page) || page->mapping != mapping) { + error = shmem_getpage(inode, index, &page, + SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + if (end_index == index) { + unsigned int plen; + + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + this_len = min(this_len, plen - loff); + len = this_len; + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + + if (error > 0) { + *ppos += error; + file_accessed(in); + } + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -1821,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = - sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); + buf->f_bavail = + buf->f_bfree = sbinfo->max_blocks - + percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; @@ -1843,8 +1457,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { - error = security_inode_init_security(inode, dir, NULL, NULL, - NULL); + error = security_inode_init_security(inode, dir, + &dentry->d_name, NULL, + NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -1971,7 +1586,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int error; int len; struct inode *inode; - struct page *page = NULL; + struct page *page; char *kaddr; struct shmem_inode_info *info; @@ -1983,8 +1598,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (!inode) return -ENOSPC; - error = security_inode_init_security(inode, dir, NULL, NULL, - NULL); + error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, + NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -1995,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; - if (len <= (char *)inode - (char *)info) { - /* do it inline */ - memcpy(info, symname, len); - inode->i_op = &shmem_symlink_inline_operations; + if (len <= SHORT_SYMLINK_LEN) { + info->symlink = kmemdup(symname, len, GFP_KERNEL); + if (!info->symlink) { + iput(inode); + return -ENOMEM; + } + inode->i_op = &shmem_short_symlink_operations; } else { error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { @@ -2021,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); + nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); return NULL; } static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) { struct page *page = NULL; - int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); - nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); + int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); + nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); if (page) unlock_page(page); return page; @@ -2047,63 +1665,252 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co } } -static const struct inode_operations shmem_symlink_inline_operations = { - .readlink = generic_readlink, - .follow_link = shmem_follow_link_inline, -}; - -static const struct inode_operations shmem_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = shmem_follow_link, - .put_link = shmem_put_link, -}; - -#ifdef CONFIG_TMPFS_POSIX_ACL +#ifdef CONFIG_TMPFS_XATTR /* - * Superblocks without xattr inode operations will get security.* xattr - * support from the VFS "for free". As soon as we have any other xattrs + * Superblocks without xattr inode operations may get some security.* xattr + * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ -static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, - size_t list_len, const char *name, - size_t name_len, int handler_flags) +static int shmem_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size) { - return security_inode_listsecurity(dentry->d_inode, list, list_len); -} + struct shmem_inode_info *info; + struct shmem_xattr *xattr; + int ret = -ENODATA; -static int shmem_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int handler_flags) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return xattr_getsecurity(dentry->d_inode, name, buffer, size); + info = SHMEM_I(dentry->d_inode); + + spin_lock(&info->lock); + list_for_each_entry(xattr, &info->xattr_list, list) { + if (strcmp(name, xattr->name)) + continue; + + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, xattr->size); + } + break; + } + spin_unlock(&info->lock); + return ret; } -static int shmem_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int handler_flags) +static int shmem_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return security_inode_setsecurity(dentry->d_inode, name, value, - size, flags); -} + struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_xattr *xattr; + struct shmem_xattr *new_xattr = NULL; + size_t len; + int err = 0; + + /* value == NULL means remove */ + if (value) { + /* wrap around? */ + len = sizeof(*new_xattr) + size; + if (len <= sizeof(*new_xattr)) + return -ENOMEM; + + new_xattr = kmalloc(len, GFP_KERNEL); + if (!new_xattr) + return -ENOMEM; + + new_xattr->name = kstrdup(name, GFP_KERNEL); + if (!new_xattr->name) { + kfree(new_xattr); + return -ENOMEM; + } -static const struct xattr_handler shmem_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = shmem_xattr_security_list, - .get = shmem_xattr_security_get, - .set = shmem_xattr_security_set, -}; + new_xattr->size = size; + memcpy(new_xattr->value, value, size); + } + + spin_lock(&info->lock); + list_for_each_entry(xattr, &info->xattr_list, list) { + if (!strcmp(name, xattr->name)) { + if (flags & XATTR_CREATE) { + xattr = new_xattr; + err = -EEXIST; + } else if (new_xattr) { + list_replace(&xattr->list, &new_xattr->list); + } else { + list_del(&xattr->list); + } + goto out; + } + } + if (flags & XATTR_REPLACE) { + xattr = new_xattr; + err = -ENODATA; + } else { + list_add(&new_xattr->list, &info->xattr_list); + xattr = NULL; + } +out: + spin_unlock(&info->lock); + if (xattr) + kfree(xattr->name); + kfree(xattr); + return err; +} static const struct xattr_handler *shmem_xattr_handlers[] = { +#ifdef CONFIG_TMPFS_POSIX_ACL &generic_acl_access_handler, &generic_acl_default_handler, - &shmem_xattr_security_handler, +#endif NULL }; + +static int shmem_xattr_validate(const char *name) +{ + struct { const char *prefix; size_t len; } arr[] = { + { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, + { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } + }; + int i; + + for (i = 0; i < ARRAY_SIZE(arr); i++) { + size_t preflen = arr[i].len; + if (strncmp(name, arr[i].prefix, preflen) == 0) { + if (!name[preflen]) + return -EINVAL; + return 0; + } + } + return -EOPNOTSUPP; +} + +static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + int err; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + err = shmem_xattr_validate(name); + if (err) + return err; + + return shmem_xattr_get(dentry, name, buffer, size); +} + +static int shmem_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + err = shmem_xattr_validate(name); + if (err) + return err; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + + return shmem_xattr_set(dentry, name, value, size, flags); + +} + +static int shmem_removexattr(struct dentry *dentry, const char *name) +{ + int err; + + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + err = shmem_xattr_validate(name); + if (err) + return err; + + return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); +} + +static bool xattr_is_trusted(const char *name) +{ + return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); +} + +static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + bool trusted = capable(CAP_SYS_ADMIN); + struct shmem_xattr *xattr; + struct shmem_inode_info *info; + size_t used = 0; + + info = SHMEM_I(dentry->d_inode); + + spin_lock(&info->lock); + list_for_each_entry(xattr, &info->xattr_list, list) { + size_t len; + + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + len = strlen(xattr->name) + 1; + used += len; + if (buffer) { + if (size < used) { + used = -ERANGE; + break; + } + memcpy(buffer, xattr->name, len); + buffer += len; + } + } + spin_unlock(&info->lock); + + return used; +} +#endif /* CONFIG_TMPFS_XATTR */ + +static const struct inode_operations shmem_short_symlink_operations = { + .readlink = generic_readlink, + .follow_link = shmem_follow_short_symlink, +#ifdef CONFIG_TMPFS_XATTR + .setxattr = shmem_setxattr, + .getxattr = shmem_getxattr, + .listxattr = shmem_listxattr, + .removexattr = shmem_removexattr, #endif +}; + +static const struct inode_operations shmem_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = shmem_follow_link, + .put_link = shmem_put_link, +#ifdef CONFIG_TMPFS_XATTR + .setxattr = shmem_setxattr, + .getxattr = shmem_getxattr, + .listxattr = shmem_listxattr, + .removexattr = shmem_removexattr, +#endif +}; static struct dentry *shmem_get_parent(struct dentry *child) { @@ -2144,8 +1951,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, { struct inode *inode = dentry->d_inode; - if (*len < 3) + if (*len < 3) { + *len = 3; return 255; + } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, @@ -2282,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) if (config.max_inodes < inodes) goto out; /* - * Those tests also disallow limited->unlimited while any are in - * use, so i_blocks will always be zero when max_blocks is zero; + * Those tests disallow limited->unlimited while any are in use; * but we must separately disallow unlimited->limited, because * in that case we have no record of how much is already in use. */ @@ -2375,14 +2183,16 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; - sb->s_maxbytes = SHMEM_MAX_BYTES; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; -#ifdef CONFIG_TMPFS_POSIX_ACL +#ifdef CONFIG_TMPFS_XATTR sb->s_xattr = shmem_xattr_handlers; +#endif +#ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif @@ -2408,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { - struct shmem_inode_info *p; - p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); - if (!p) + struct shmem_inode_info *info; + info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + if (!info) return NULL; - return &p->vfs_inode; + return &info->vfs_inode; } -static void shmem_i_callback(struct rcu_head *head) +static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); INIT_LIST_HEAD(&inode->i_dentry); @@ -2424,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head) static void shmem_destroy_inode(struct inode *inode) { - if ((inode->i_mode & S_IFMT) == S_IFREG) { - /* only struct inode is valid if it's an inline symlink */ + if ((inode->i_mode & S_IFMT) == S_IFREG) mpol_free_shared_policy(&SHMEM_I(inode)->policy); - } - call_rcu(&inode->i_rcu, shmem_i_callback); + call_rcu(&inode->i_rcu, shmem_destroy_callback); } -static void init_once(void *foo) +static void shmem_init_inode(void *foo) { - struct shmem_inode_info *p = (struct shmem_inode_info *) foo; - - inode_init_once(&p->vfs_inode); + struct shmem_inode_info *info = foo; + inode_init_once(&info->vfs_inode); } -static int init_inodecache(void) +static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, init_once); + 0, SLAB_PANIC, shmem_init_inode); return 0; } -static void destroy_inodecache(void) +static void shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } @@ -2455,7 +2262,6 @@ static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS - .readpage = shmem_readpage, .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif @@ -2472,22 +2278,20 @@ static const struct file_operations shmem_file_operations = { .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, #endif }; static const struct inode_operations shmem_inode_operations = { - .setattr = shmem_notify_change, + .setattr = shmem_setattr, .truncate_range = shmem_truncate_range, -#ifdef CONFIG_TMPFS_POSIX_ACL - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, - .check_acl = generic_check_acl, +#ifdef CONFIG_TMPFS_XATTR + .setxattr = shmem_setxattr, + .getxattr = shmem_getxattr, + .listxattr = shmem_listxattr, + .removexattr = shmem_removexattr, #endif - }; static const struct inode_operations shmem_dir_inode_operations = { @@ -2502,24 +2306,26 @@ static const struct inode_operations shmem_dir_inode_operations = { .mknod = shmem_mknod, .rename = shmem_rename, #endif +#ifdef CONFIG_TMPFS_XATTR + .setxattr = shmem_setxattr, + .getxattr = shmem_getxattr, + .listxattr = shmem_listxattr, + .removexattr = shmem_removexattr, +#endif #ifdef CONFIG_TMPFS_POSIX_ACL - .setattr = shmem_notify_change, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, - .check_acl = generic_check_acl, + .setattr = shmem_setattr, #endif }; static const struct inode_operations shmem_special_inode_operations = { +#ifdef CONFIG_TMPFS_XATTR + .setxattr = shmem_setxattr, + .getxattr = shmem_getxattr, + .listxattr = shmem_listxattr, + .removexattr = shmem_removexattr, +#endif #ifdef CONFIG_TMPFS_POSIX_ACL - .setattr = shmem_notify_change, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = generic_listxattr, - .removexattr = generic_removexattr, - .check_acl = generic_check_acl, + .setattr = shmem_setattr, #endif }; @@ -2544,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; - static struct dentry *shmem_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_nodev(fs_type, flags, data, shmem_fill_super); } -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .mount = shmem_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { int error; @@ -2566,18 +2371,18 @@ int __init init_tmpfs(void) if (error) goto out4; - error = init_inodecache(); + error = shmem_init_inodecache(); if (error) goto out3; - error = register_filesystem(&tmpfs_fs_type); + error = register_filesystem(&shmem_fs_type); if (error) { printk(KERN_ERR "Could not register tmpfs\n"); goto out2; } - shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, - tmpfs_fs_type.name, NULL); + shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, + shmem_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); printk(KERN_ERR "Could not kern_mount tmpfs\n"); @@ -2586,9 +2391,9 @@ int __init init_tmpfs(void) return 0; out1: - unregister_filesystem(&tmpfs_fs_type); + unregister_filesystem(&shmem_fs_type); out2: - destroy_inodecache(); + shmem_destroy_inodecache(); out3: bdi_destroy(&shmem_backing_dev_info); out4: @@ -2596,45 +2401,6 @@ out4: return error; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file - * @inode: the inode to be searched - * @pgoff: the offset to be searched - * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) -{ - swp_entry_t entry = { .val = 0 }, *ptr; - struct page *page = NULL; - struct shmem_inode_info *info = SHMEM_I(inode); - - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - - spin_lock(&info->lock); - ptr = shmem_swp_entry(info, pgoff, NULL); -#ifdef CONFIG_SWAP - if (ptr && ptr->val) { - entry.val = ptr->val; - page = find_get_page(&swapper_space, entry.val); - } else -#endif - page = find_get_page(inode->i_mapping, pgoff); - if (ptr) - shmem_swp_unmap(ptr); - spin_unlock(&info->lock); -out: - *pagep = page; - *ent = entry; -} -#endif - #else /* !CONFIG_SHMEM */ /* @@ -2648,23 +2414,23 @@ out: #include <linux/ramfs.h> -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { - BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); + BUG_ON(register_filesystem(&shmem_fs_type) != 0); - shm_mnt = kern_mount(&tmpfs_fs_type); + shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); return 0; } -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { return 0; } @@ -2674,37 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file - * @inode: the inode to be searched - * @pgoff: the offset to be searched - * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { - struct page *page = NULL; - - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - page = find_get_page(inode->i_mapping, pgoff); -out: - *pagep = page; - *ent = (swp_entry_t){ .val = 0 }; + truncate_inode_pages_range(inode->i_mapping, lstart, lend); } -#endif +EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE #endif /* CONFIG_SHMEM */ @@ -2728,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (IS_ERR(shm_mnt)) return (void *)shm_mnt; - if (size < 0 || size > SHMEM_MAX_BYTES) + if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (shmem_acct_size(flags, size)) @@ -2791,5 +2537,45 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } + +/** + * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", + * with any new page allocations done using the specified allocation flags. + * But read_cache_page_gfp() uses the ->readpage() method: which does not + * suit tmpfs, since it may have pages in swapcache, and needs to find those + * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. + * + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. + */ +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ +#ifdef CONFIG_SHMEM + struct inode *inode = mapping->host; + struct page *page; + int error; + + BUG_ON(mapping->a_ops != &shmem_aops); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + if (error) + page = ERR_PTR(error); + else + unlock_page(page); + return page; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ + return read_cache_page_gfp(mapping, index, gfp); +#endif +} +EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); diff --git a/mm/slab.c b/mm/slab.c index 37961d1f584..6d90a091fdc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -115,6 +115,7 @@ #include <linux/debugobjects.h> #include <linux/kmemcheck.h> #include <linux/memory.h> +#include <linux/prefetch.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -191,22 +192,6 @@ typedef unsigned int kmem_bufctl_t; #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; -}; - -/* * struct slab_rcu * * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to @@ -219,8 +204,6 @@ struct slab { * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. */ struct slab_rcu { struct rcu_head head; @@ -229,6 +212,27 @@ struct slab_rcu { }; /* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + union { + struct { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; + }; + struct slab_rcu __slab_cover_slab_rcu; + }; +}; + +/* * struct array_cache * * Purpose: @@ -570,7 +574,9 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ +static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache cache_cache = { + .nodelists = cache_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -616,6 +622,51 @@ int slab_is_available(void) static struct lock_class_key on_slab_l3_key; static struct lock_class_key on_slab_alc_key; +static struct lock_class_key debugobj_l3_key; +static struct lock_class_key debugobj_alc_key; + +static void slab_set_lock_classes(struct kmem_cache *cachep, + struct lock_class_key *l3_key, struct lock_class_key *alc_key, + int q) +{ + struct array_cache **alc; + struct kmem_list3 *l3; + int r; + + l3 = cachep->nodelists[q]; + if (!l3) + return; + + lockdep_set_class(&l3->list_lock, l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + return; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, alc_key); + } +} + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ + slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ + int node; + + for_each_online_node(node) + slab_set_debugobj_lock_classes_node(cachep, node); +} + static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; @@ -624,29 +675,14 @@ static void init_node_lock_keys(int q) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { - struct array_cache **alc; struct kmem_list3 *l3; - int r; l3 = s->cs_cachep->nodelists[q]; if (!l3 || OFF_SLAB(s->cs_cachep)) continue; - lockdep_set_class(&l3->list_lock, &on_slab_l3_key); - alc = l3->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - continue; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, - &on_slab_alc_key); - } + + slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, + &on_slab_alc_key, q); } } @@ -665,6 +701,14 @@ static void init_node_lock_keys(int q) static inline void init_lock_keys(void) { } + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ +} #endif /* @@ -875,7 +919,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transfered to another + * However, when such objects are allocated or transferred to another * cache the pointers are not cleared and they could be counted as * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. @@ -1258,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu) spin_unlock_irq(&l3->list_lock); kfree(shared); free_alien_cache(alien); + if (cachep->flags & SLAB_DEBUG_OBJECTS) + slab_set_debugobj_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1387,7 +1433,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self, break; } out: - return ret ? notifier_from_errno(ret) : NOTIFY_OK; + return notifier_from_errno(ret); } #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ @@ -1488,11 +1534,10 @@ void __init kmem_cache_init(void) cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* - * struct kmem_cache size depends on nr_node_ids, which - * can be less than MAX_NUMNODES. + * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + - nr_node_ids * sizeof(struct kmem_list3 *); + cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *); #if DEBUG cache_cache.obj_size = cache_cache.buffer_size; #endif @@ -1621,6 +1666,9 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* 6) resize the head arrays to their final sizes */ mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) @@ -1631,9 +1679,6 @@ void __init kmem_cache_init_late(void) /* Done! */ g_cpucache_up = FULL; - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* * Register a cpu startup notifier callback that initializes * cpu_cache_get for all new cpus @@ -2147,8 +2192,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. - * Note that kmem_cache_name() is not guaranteed to return the same pointer, - * therefore applications must manage it themselves. * * The flags are * @@ -2288,8 +2331,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < align) { ralign = align; } - /* disable debug if not aligning with REDZONE_ALIGN */ - if (ralign & (__alignof__(unsigned long long) - 1)) + /* disable debug if necessary */ + if (ralign > __alignof__(unsigned long long)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. @@ -2306,6 +2349,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep) goto oops; + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; #if DEBUG cachep->obj_size = size; @@ -2315,8 +2359,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += align; - size += align + sizeof(unsigned long long); + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of @@ -2422,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, goto oops; } + if (flags & SLAB_DEBUG_OBJECTS) { + /* + * Would deadlock through slab_destroy()->call_rcu()-> + * debug_object_activate()->kmem_cache_alloc(). + */ + WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); + + slab_set_debugobj_lock_classes(cachep); + } + /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); oops: @@ -2605,7 +2659,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * * The cache must be empty before calling this function. * - * The caller must guarantee that noone will allocate memory from the cache + * The caller must guarantee that no one will allocate memory from the cache * during the kmem_cache_destroy(). */ void kmem_cache_destroy(struct kmem_cache *cachep) @@ -3151,12 +3205,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); -#if ARCH_SLAB_MINALIGN - if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { + if (ARCH_SLAB_MINALIGN && + ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", - objp, ARCH_SLAB_MINALIGN); + objp, (int)ARCH_SLAB_MINALIGN); } -#endif return objp; } #else @@ -3400,7 +3453,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (nodeid == -1) + if (nodeid == NUMA_NO_NODE) nodeid = slab_node; if (unlikely(!cachep->nodelists[nodeid])) { @@ -3602,13 +3655,14 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. */ -static inline void __cache_free(struct kmem_cache *cachep, void *objp) +static inline void __cache_free(struct kmem_cache *cachep, void *objp, + void *caller) { struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); - objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + objp = cache_free_debugcheck(cachep, objp, caller); kmemcheck_slab_free(cachep, objp, obj_size(cachep)); @@ -3799,7 +3853,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_locks_freed(objp, obj_size(cachep)); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, obj_size(cachep)); - __cache_free(cachep, objp); + __cache_free(cachep, objp, __builtin_return_address(0)); local_irq_restore(flags); trace_kmem_cache_free(_RET_IP_, objp); @@ -3829,7 +3883,7 @@ void kfree(const void *objp) c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); debug_check_no_obj_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp); + __cache_free(c, (void *)objp, __builtin_return_address(0)); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); @@ -3840,12 +3894,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *cachep) -{ - return cachep->name; -} -EXPORT_SYMBOL_GPL(kmem_cache_name); - /* * This initializes kmem_list3 or resizes various caches for all nodes. */ @@ -3936,7 +3984,7 @@ fail: struct ccupdate_struct { struct kmem_cache *cachep; - struct array_cache *new[NR_CPUS]; + struct array_cache *new[0]; }; static void do_ccupdate_local(void *info) @@ -3958,7 +4006,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct ccupdate_struct *new; int i; - new = kzalloc(sizeof(*new), gfp); + new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), + gfp); if (!new) return -ENOMEM; diff --git a/mm/slob.c b/mm/slob.c index 3588eaaef72..bf391818716 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -70,7 +70,7 @@ #include <trace/events/kmem.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * slob_block has a field 'units', which indicates size of block if +ve, @@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; + gfp &= gfp_allowed_mask; + lockdep_trace_alloc(gfp); if (size < PAGE_SIZE - align) { @@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, @@ -666,12 +672,6 @@ unsigned int kmem_cache_size(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *c) -{ - return c->name; -} -EXPORT_SYMBOL(kmem_cache_name); - int kmem_cache_shrink(struct kmem_cache *d) { return 0; diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c..9f662d70eb4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2,10 +2,11 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks and only - * uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using per slab locks or atomic operatios + * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter */ #include <linux/mm.h> @@ -27,20 +28,33 @@ #include <linux/memory.h> #include <linux/math64.h> #include <linux/fault-inject.h> +#include <linux/stacktrace.h> #include <trace/events/kmem.h> /* * Lock order: - * 1. slab_lock(page) - * 2. slab->list_lock + * 1. slub_lock (Global Semaphore) + * 2. node->list_lock + * 3. slab_lock(page) (Only on some arches and for debugging) * - * The slab_lock protects operations on the object of a particular - * slab and its metadata in the page struct. If the slab lock - * has been taken then no allocations nor frees can be performed - * on the objects in the slab nor can the slab be added or removed - * from the partial or full lists since this would mean modifying - * the page_struct of the slab. + * slub_lock + * + * The role of the slub_lock is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. + * + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct. Meaning + * A. page->freelist -> List of object free in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -53,20 +67,6 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * - * The lock order is sometimes inverted when we are trying to get a slab - * off a list. We take the list_lock and then look for a page on the list - * to use. While we do that objects in the slabs may be freed. We can - * only operate on the slab if we have also taken the slab_lock. So we use - * a slab_trylock() on the slab. If trylock was successful then no frees - * can occur anymore and we can use the slab for allocations etc. If the - * slab_trylock() does not succeed then frees are in progress in the slab and - * we must stay away from it for a while since we may cause a bouncing - * cacheline if we try to acquire the lock. So go onto the next slab. - * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has noone operating on it and thus there is - * no danger of cacheline contention. - * * Interrupts are disabled during allocation and deallocation in order to * make the slab allocator safe to use in the context of an irq. In addition * interrupts are disabled to ensure that the processor does not change @@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ static int kmem_size = sizeof(struct kmem_cache); @@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches); /* * Tracking user of a slab. */ +#define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -217,7 +225,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) #endif -static inline void stat(struct kmem_cache *s, enum stat_item si) +static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS __this_cpu_inc(s->cpu_slab->stat[si]); @@ -261,6 +269,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return *(void **)(object + s->offset); } +static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) +{ + void *p; + +#ifdef CONFIG_DEBUG_PAGEALLOC + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); +#else + p = get_freepointer(s, object); +#endif + return p; +} + static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { *(void **)(object + s->offset) = fp; @@ -271,21 +291,46 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) -/* Scan freelist */ -#define for_each_free_object(__p, __s, __free) \ - for (__p = (__free); __p; __p = get_freepointer((__s), __p)) - /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { return (p - addr) / s->size; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -301,8 +346,111 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + return 1; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + local_irq_restore(flags); + return 1; + } + slab_unlock(page); + local_irq_restore(flags); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + #ifdef CONFIG_SLUB_DEBUG /* + * Determine a map of object in use on a page. + * + * Node listlock must be held to guarantee that the page does + * not vanish from under us. + */ +static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) +{ + void *p; + void *addr = page_address(page); + + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit(slab_index(p, s, addr), map); +} + +/* * Debug settings: */ #ifdef CONFIG_SLUB_DEBUG_ON @@ -368,6 +516,24 @@ static void set_track(struct kmem_cache *s, void *object, struct track *p = get_track(s, object, alloc); if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + save_stack_trace(&trace); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; @@ -392,6 +558,16 @@ static void print_track(const char *s, struct track *t) printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif } static void print_tracking(struct kmem_cache *s, void *object) @@ -505,10 +681,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) { while (bytes) { - if (*start != (u8)value) + if (*start != value) return start; start++; bytes--; @@ -516,6 +692,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) return NULL; } +static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) +{ + u64 value64; + unsigned int words, prefix; + + if (bytes <= 16) + return check_bytes8(start, value, bytes); + + value64 = value | value << 8 | value << 16 | value << 24; + value64 = (value64 & 0xffffffff) | value64 << 32; + prefix = 8 - ((unsigned long)start) % 8; + + if (prefix) { + u8 *r = check_bytes8(start, value, prefix); + if (r) + return r; + start += prefix; + bytes -= prefix; + } + + words = bytes / 8; + + while (words) { + if (*(u64 *)start != value64) + return check_bytes8(start, value, 8); + start += 8; + words--; + } + + return check_bytes8(start, value, bytes % 8); +} + static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { @@ -617,7 +825,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) @@ -698,7 +906,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -721,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { int nr = 0; - void *fp = page->freelist; + void *fp; void *object = NULL; unsigned long max_objects; + fp = page->freelist; while (fp && nr <= page->objects) { if (fp == search) return 1; @@ -748,7 +957,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -800,45 +1009,56 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) -{ - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); + /* + * Trouble is that we may no longer disable interupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->objsize); + debug_check_no_locks_freed(x, s->objsize); + local_irq_restore(flags); + } +#endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(x, s->objsize); } /* * Tracking of fully allocated slabs for debugging purposes. + * + * list_lock must be held. */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - spin_lock(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) + return; + list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); } +/* + * list_lock must be held. + */ static void remove_full(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n; - if (!(s->flags & SLAB_STORE_USER)) return; - n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); list_del(&page->lru); - spin_unlock(&n->list_lock); } /* Tracking of the number of slabs for debugging purposes */ @@ -894,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa if (!check_slab(s, page)) goto bad; - if (!on_freelist(s, page, object)) { - object_err(s, page, object, "Object already allocated"); - goto bad; - } - if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); goto bad; @@ -931,6 +1146,12 @@ bad: static noinline int free_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { + unsigned long flags; + int rc = 0; + + local_irq_save(flags); + slab_lock(page); + if (!check_slab(s, page)) goto fail; @@ -945,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - return 0; + goto out; if (unlikely(s != page->slab)) { if (!PageSlab(page)) { @@ -962,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, goto fail; } - /* Special debug activities for freeing objects */ - if (!PageSlubFrozen(page) && !page->freelist) - remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); - return 1; + rc = 1; +out: + slab_unlock(page); + local_irq_restore(flags); + return rc; fail: slab_fix(s, "Object at 0x%p not freed", object); - return 0; + goto out; } static int __init setup_slub_debug(char *str) @@ -1073,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1101,9 +1325,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, static inline void slab_free_hook(struct kmem_cache *s, void *x) {} -static inline void slab_free_hook_irq(struct kmem_cache *s, - void *object) {} - #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1128,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + flags &= gfp_allowed_mask; + + if (flags & __GFP_WAIT) + local_irq_enable(); + flags |= s->allocflags; /* @@ -1144,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(flags, node, oo); - if (!page) - return NULL; - stat(s, ORDER_FALLBACK); + if (page) + stat(s, ORDER_FALLBACK); } + if (flags & __GFP_WAIT) + local_irq_disable(); + + if (!page) + return NULL; + if (kmemcheck_enabled && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); @@ -1217,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = 0; + page->frozen = 1; out: return page; } @@ -1249,21 +1481,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -1277,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page) } /* - * Per slab locking using the pagelock - */ -static __always_inline void slab_lock(struct page *page) -{ - bit_spin_lock(PG_locked, &page->flags); -} - -static __always_inline void slab_unlock(struct page *page) -{ - __bit_spin_unlock(PG_locked, &page->flags); -} - -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; - - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} - -/* - * Management of partially allocated slabs + * Management of partially allocated slabs. + * + * list_lock must be held. */ -static void add_partial(struct kmem_cache_node *n, +static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - spin_lock(&n->list_lock); n->nr_partial++; if (tail) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); } -static inline void __remove_partial(struct kmem_cache_node *n, +/* + * list_lock must be held. + */ +static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } -static void remove_partial(struct kmem_cache *s, struct page *page) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); - __remove_partial(n, page); - spin_unlock(&n->list_lock); -} - /* - * Lock slab and remove from the partial list. + * Lock slab, remove from the partial list and put the object into the + * per cpu freelist. * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, - struct page *page) +static inline int acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - if (slab_trylock(page)) { - __remove_partial(n, page); - __SetPageSlubFrozen(page); + void *freelist; + unsigned long counters; + struct page new; + + /* + * Zap the freelist and set the frozen bit. + * The old freelist is the list of objects for the + * per cpu allocation list. + */ + do { + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + new.inuse = page->objects; + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + } while (!__cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "lock and freeze")); + + remove_partial(n, page); + + if (freelist) { + /* Populate the per cpu freelist */ + this_cpu_write(s->cpu_slab->freelist, freelist); + this_cpu_write(s->cpu_slab->page, page); + this_cpu_write(s->cpu_slab->node, page_to_nid(page)); return 1; + } else { + /* + * Slab page came from the wrong list. No object to allocate + * from. Put it onto the correct list and continue partial + * scan. + */ + printk(KERN_ERR "SLUB: %s : Page without available objects on" + " partial list\n", s->name); + return 0; } - return 0; } /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache_node *n) +static struct page *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n) { struct page *page; @@ -1362,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n) spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) + if (acquire_slab(s, n, page)) goto out; page = NULL; out: @@ -1413,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(n); + page = get_partial_node(s, n); if (page) { put_mems_allowed(); return page; @@ -1433,98 +1692,237 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) struct page *page; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(get_node(s, searchnode)); - if (page || node != -1) + page = get_partial_node(s, get_node(s, searchnode)); + if (page || node != NUMA_NO_NODE) return page; return get_any_partial(s, flags); } +#ifdef CONFIG_PREEMPT /* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. + * Calculate the next globally unique transaction for disambiguiation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus. */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) - __releases(bitlock) +#define TID_STEP 1 +#endif + +static inline unsigned long next_tid(unsigned long tid) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + return tid + TID_STEP; +} - __ClearPageSlubFrozen(page); - if (page->inuse) { +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} - if (page->freelist) { - add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); - } else { - stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); - } - slab_unlock(page); - } else { - stat(s, DEACTIVATE_EMPTY); - if (n->nr_partial < s->min_partial) { - /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. This slab needs - * to come after the other slabs with objects in - * so that the others get filled first. That way the - * size of the partial list stays small. - * - * kmem_cache_shrink can reclaim any empty slabs from - * the partial list. - */ - add_partial(n, page, 1); - slab_unlock(page); - } else { - slab_unlock(page); - stat(s, FREE_SLAB); - discard_slab(s, page); - } - } +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; } +static inline unsigned int init_tid(int cpu) +{ + return cpu; +} + +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) +{ +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPT + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + printk("due to cpu change %d -> %d\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + printk("due to cpu running other code. Event %ld->%ld\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif + stat(s, CMPXCHG_DOUBLE_CPU_FAIL); +} + +void init_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); +} +/* + * Remove the cpu slab + */ + /* * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) - __releases(bitlock) { + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct page *page = c->page; - int tail = 1; - - if (page->freelist) + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + int lock = 0; + enum slab_modes l = M_NONE, m = M_NONE; + void *freelist; + void *nextfree; + int tail = 0; + struct page new; + struct page old; + + if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); + tail = 1; + } + + c->tid = next_tid(c->tid); + c->page = NULL; + freelist = c->freelist; + c->freelist = NULL; + /* - * Merge cpu freelist into slab freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. + * Stage one: Free all available per cpu objects back + * to the page freelist while it is still frozen. Leave the + * last one. + * + * There is no need to take the list->lock because the page + * is still frozen. */ - while (unlikely(c->freelist)) { - void **object; + while (freelist && (nextfree = get_freepointer(s, freelist))) { + void *prior; + unsigned long counters; + + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, freelist, prior); + new.counters = counters; + new.inuse--; + VM_BUG_ON(!new.frozen); + + } while (!__cmpxchg_double_slab(s, page, + prior, counters, + freelist, new.counters, + "drain percpu freelist")); + + freelist = nextfree; + } - tail = 0; /* Hot objects. Put the slab first */ + /* + * Stage two: Ensure that the page is unfrozen while the + * list presence reflects the actual number of objects + * during unfreeze. + * + * We setup the list membership and then perform a cmpxchg + * with the count. If there is a mismatch then the page + * is not unfrozen but the page is on the wrong list. + * + * Then we restart the process which may have to remove + * the page from the list that we just put it on again + * because the number of objects in the slab may have + * changed. + */ +redo: + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist) { + new.inuse--; + set_freepointer(s, freelist, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; - /* Retrieve object from cpu_freelist */ - object = c->freelist; - c->freelist = get_freepointer(s, c->freelist); + new.frozen = 0; - /* And put onto the regular freelist */ - set_freepointer(s, object, page->freelist); - page->freelist = object; - page->inuse--; + if (!new.inuse && n->nr_partial > s->min_partial) + m = M_FREE; + else if (new.freelist) { + m = M_PARTIAL; + if (!lock) { + lock = 1; + /* + * Taking the spinlock removes the possiblity + * that acquire_slab() will see a slab page that + * is frozen + */ + spin_lock(&n->list_lock); + } + } else { + m = M_FULL; + if (kmem_cache_debug(s) && !lock) { + lock = 1; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock(&n->list_lock); + } + } + + if (l != m) { + + if (l == M_PARTIAL) + + remove_partial(n, page); + + else if (l == M_FULL) + + remove_full(s, page); + + if (m == M_PARTIAL) { + + add_partial(n, page, tail); + stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + + } else if (m == M_FULL) { + + stat(s, DEACTIVATE_FULL); + add_full(s, n, page); + + } + } + + l = m; + if (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + if (lock) + spin_unlock(&n->list_lock); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); } - c->page = NULL; - unfreeze_slab(s, page, tail); } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - slab_lock(c->page); deactivate_slab(s, c); } @@ -1651,77 +2049,124 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void **object; - struct page *new; + struct page *page; + unsigned long flags; + struct page new; + unsigned long counters; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif /* We handle __GFP_ZERO in the caller */ gfpflags &= ~__GFP_ZERO; - if (!c->page) + page = c->page; + if (!page) goto new_slab; - slab_lock(c->page); - if (unlikely(!node_match(c, node))) - goto another_slab; + if (unlikely(!node_match(c, node))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, c); + goto new_slab; + } + + stat(s, ALLOC_SLOWPATH); + + do { + object = page->freelist; + counters = page->counters; + new.counters = counters; + VM_BUG_ON(!new.frozen); + + /* + * If there is no object left then we use this loop to + * deactivate the slab which is simple since no objects + * are left in the slab and therefore we do not need to + * put the page back onto the partial list. + * + * If there are objects left then we retrieve them + * and use them to refill the per cpu queue. + */ + + new.inuse = page->objects; + new.frozen = object != NULL; + + } while (!__cmpxchg_double_slab(s, page, + object, counters, + NULL, new.counters, + "__slab_alloc")); + + if (unlikely(!object)) { + c->page = NULL; + stat(s, DEACTIVATE_BYPASS); + goto new_slab; + } stat(s, ALLOC_REFILL); load_freelist: - object = c->page->freelist; - if (unlikely(!object)) - goto another_slab; - if (kmem_cache_debug(s)) - goto debug; - + VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); - c->page->inuse = c->page->objects; - c->page->freelist = NULL; - c->node = page_to_nid(c->page); -unlock_out: - slab_unlock(c->page); - stat(s, ALLOC_SLOWPATH); + c->tid = next_tid(c->tid); + local_irq_restore(flags); return object; -another_slab: - deactivate_slab(s, c); - new_slab: - new = get_partial(s, gfpflags, node); - if (new) { - c->page = new; + page = get_partial(s, gfpflags, node); + if (page) { stat(s, ALLOC_FROM_PARTIAL); + object = c->freelist; + + if (kmem_cache_debug(s)) + goto debug; goto load_freelist; } - gfpflags &= gfp_allowed_mask; - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - - new = new_slab(s, gfpflags, node); - - if (gfpflags & __GFP_WAIT) - local_irq_disable(); + page = new_slab(s, gfpflags, node); - if (new) { + if (page) { c = __this_cpu_ptr(s->cpu_slab); - stat(s, ALLOC_SLAB); if (c->page) flush_slab(s, c); - slab_lock(new); - __SetPageSlubFrozen(new); - c->page = new; + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + page->inuse = page->objects; + + stat(s, ALLOC_SLAB); + c->node = page_to_nid(page); + c->page = page; + + if (kmem_cache_debug(s)) + goto debug; goto load_freelist; } if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); + local_irq_restore(flags); return NULL; + debug: - if (!alloc_debug_processing(s, c->page, object, addr)) - goto another_slab; + if (!object || !alloc_debug_processing(s, page, object, addr)) + goto new_slab; - c->page->inuse++; - c->page->freelist = get_freepointer(s, object); + c->freelist = get_freepointer(s, object); + deactivate_slab(s, c); + c->page = NULL; c->node = NUMA_NO_NODE; - goto unlock_out; + local_irq_restore(flags); + return object; } /* @@ -1739,23 +2184,58 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, { void **object; struct kmem_cache_cpu *c; - unsigned long flags; + unsigned long tid; if (slab_pre_alloc_hook(s, gfpflags)) return NULL; - local_irq_save(flags); +redo: + + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + */ c = __this_cpu_ptr(s->cpu_slab); + + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus they can be guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + tid = c->tid; + barrier(); + object = c->freelist; if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - c->freelist = get_freepointer(s, object); + /* + * The cmpxchg will only match if there was no additional + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only against + * code executing on this cpu *not* from access by other cpus. + */ + if (unlikely(!irqsafe_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + get_freepointer_safe(s, object), next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } stat(s, ALLOC_FASTPATH); } - local_irq_restore(flags); if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); @@ -1833,57 +2313,91 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; + int was_frozen; + int inuse; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long uninitialized_var(flags); stat(s, FREE_SLOWPATH); - slab_lock(page); - if (kmem_cache_debug(s)) - goto debug; + if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) + return; -checks_ok: - prior = page->freelist; - set_freepointer(s, object, prior); - page->freelist = object; - page->inuse--; + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen && !n) { + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + } + inuse = new.inuse; - if (unlikely(PageSlubFrozen(page))) { - stat(s, FREE_FROZEN); - goto out_unlock; - } + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); - if (unlikely(!page->inuse)) - goto slab_empty; + if (likely(!n)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } /* - * Objects left in the slab. If it was not on the partial list before - * then add it. + * was_frozen may have been set after we acquired the list_lock in + * an earlier loop. So we need to check it here again. */ - if (unlikely(!prior)) { - add_partial(get_node(s, page_to_nid(page)), page, 1); - stat(s, FREE_ADD_PARTIAL); - } + if (was_frozen) + stat(s, FREE_FROZEN); + else { + if (unlikely(!inuse && n->nr_partial > s->min_partial)) + goto slab_empty; -out_unlock: - slab_unlock(page); + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, 0); + stat(s, FREE_ADD_PARTIAL); + } + } + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: if (prior) { /* - * Slab still on the partial list. + * Slab on the partial list. */ - remove_partial(s, page); + remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } - slab_unlock(page); + } else + /* Slab must be on the full list */ + remove_full(s, page); + + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); - return; - -debug: - if (!free_debug_processing(s, page, x, addr)) - goto out_unlock; - goto checks_ok; } /* @@ -1902,23 +2416,38 @@ static __always_inline void slab_free(struct kmem_cache *s, { void **object = (void *)x; struct kmem_cache_cpu *c; - unsigned long flags; + unsigned long tid; slab_free_hook(s, x); - local_irq_save(flags); +redo: + + /* + * Determine the currently cpus per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succedd. + */ c = __this_cpu_ptr(s->cpu_slab); - slab_free_hook_irq(s, x); + tid = c->tid; + barrier(); - if (likely(page == c->page && c->node != NUMA_NO_NODE)) { + if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); - c->freelist = object; + + if (unlikely(!irqsafe_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } stat(s, FREE_FASTPATH); } else __slab_free(s, page, x, addr); - local_irq_restore(flags); } void kmem_cache_free(struct kmem_cache *s, void *x) @@ -1988,13 +2517,13 @@ static int slub_nomerge; * the smallest order which will fit the object. */ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -2003,10 +2532,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2016,7 +2545,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2034,14 +2563,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2053,14 +2582,14 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -2110,9 +2639,19 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); - s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); + /* + * Must align to double word boundary for the double cmpxchg + * instructions to work; see __pcpu_double_call_return_bool(). + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), + 2 * sizeof(void *)); + + if (!s->cpu_slab) + return 0; + + init_kmem_cache_cpus(s); - return s->cpu_slab != NULL; + return 1; } static struct kmem_cache *kmem_cache_node; @@ -2130,7 +2669,6 @@ static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; - unsigned long flags; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); @@ -2148,6 +2686,7 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; + page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); @@ -2156,14 +2695,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - /* - * lockdep requires consistent irq usage for each lock - * so even though there cannot be a race this early in - * the boot sequence, we still disable irqs. - */ - local_irq_save(flags); add_partial(n, page, 0); - local_irq_restore(flags); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2311,7 +2843,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2329,8 +2861,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -2349,6 +2881,10 @@ static int kmem_cache_open(struct kmem_cache *s, s->objsize = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); + s->reserved = 0; + + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) goto error; @@ -2365,6 +2901,12 @@ static int kmem_cache_open(struct kmem_cache *s, } } +#ifdef CONFIG_CMPXCHG_DOUBLE + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + /* * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. @@ -2399,12 +2941,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; -} -EXPORT_SYMBOL(kmem_cache_name); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -2417,9 +2953,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, return; slab_err(s, page, "%s", text); slab_lock(page); - for_each_free_object(p, s, page->freelist) - set_bit(slab_index(p, s, addr), map); + get_map(s, page, map); for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { @@ -2444,7 +2979,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - __remove_partial(n, page); + remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, @@ -2696,7 +3231,6 @@ EXPORT_SYMBOL(__kmalloc_node); size_t ksize(const void *object) { struct page *page; - struct kmem_cache *s; if (unlikely(object == ZERO_SIZE_PTR)) return 0; @@ -2707,30 +3241,46 @@ size_t ksize(const void *object) WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); } - s = page->slab; + + return slab_ksize(page->slab); +} +EXPORT_SYMBOL(ksize); #ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; +bool verify_mem_not_deleted(const void *x) +{ + struct page *page; + void *object = (void *)x; + unsigned long flags; + bool rv; -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; + if (unlikely(ZERO_OR_NULL_PTR(x))) + return false; + + local_irq_save(flags); + + page = virt_to_head_page(x); + if (unlikely(!PageSlab(page))) { + /* maybe it was from stack? */ + rv = true; + goto out_unlock; + } + + slab_lock(page); + if (on_freelist(page->slab, page, object)) { + object_err(page->slab, page, object, "Object is on free-list"); + rv = false; + } else { + rv = true; + } + slab_unlock(page); + +out_unlock: + local_irq_restore(flags); + return rv; } -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(verify_mem_not_deleted); +#endif void kfree(const void *x) { @@ -2797,14 +3347,8 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ - __remove_partial(n, page); - slab_unlock(page); + if (!page->inuse) { + remove_partial(n, page); discard_slab(s, page); } else { list_move(&page->lru, @@ -2968,7 +3512,7 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) list_for_each_entry(p, &n->partial, lru) p->slab = s; -#ifdef CONFIG_SLAB_DEBUG +#ifdef CONFIG_SLUB_DEBUG list_for_each_entry(p, &n->full, lru) p->slab = s; #endif @@ -3312,7 +3856,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); - /* Honor the call site pointer we recieved. */ + /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); return ret; @@ -3342,7 +3886,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, ret = slab_alloc(s, gfpflags, node, caller); - /* Honor the call site pointer we recieved. */ + /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); return ret; @@ -3375,10 +3919,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page, /* Now we know that a valid freelist exists */ bitmap_zero(map, page->objects); - for_each_free_object(p, s, page->freelist) { - set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, SLUB_RED_INACTIVE)) - return 0; + get_map(s, page, map); + for_each_object(p, s, addr, page->objects) { + if (test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) + return 0; } for_each_object(p, s, addr, page->objects) @@ -3391,12 +3936,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, static void validate_slab_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { - if (slab_trylock(page)) { - validate_slab(s, page, map); - slab_unlock(page); - } else - printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", - s->name, page); + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, @@ -3586,8 +4128,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, void *p; bitmap_zero(map, page->objects); - for_each_free_object(p, s, page->freelist) - set_bit(slab_index(p, s, addr), map); + get_map(s, page, map); for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) @@ -3862,7 +4403,7 @@ static int any_slab_objects(struct kmem_cache *s) #endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) -#define to_slab(n) container_of(n, struct kmem_cache, kobj); +#define to_slab(n) container_of(n, struct kmem_cache, kobj) struct slab_attribute { struct attribute attr; @@ -4017,6 +4558,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4039,8 +4586,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_DEBUG_FREE; + } return length; } SLAB_ATTR(sanity_checks); @@ -4054,8 +4603,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_TRACE; + } return length; } SLAB_ATTR(trace); @@ -4072,8 +4623,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; + } calculate_sizes(s, -1); return length; } @@ -4091,8 +4644,10 @@ static ssize_t poison_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_POISON; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; + } calculate_sizes(s, -1); return length; } @@ -4110,8 +4665,10 @@ static ssize_t store_user_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_STORE_USER; + } calculate_sizes(s, -1); return length; } @@ -4276,6 +4833,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); @@ -4283,7 +4841,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); #endif static struct attribute *slab_attrs[] = { @@ -4303,6 +4864,7 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + &reserved_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, @@ -4332,6 +4894,7 @@ static struct attribute *slab_attrs[] = { &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, @@ -4339,7 +4902,10 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, diff --git a/mm/sparse.c b/mm/sparse.c index 93250207c5c..858e1dff9b2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(struct page *page) +int page_to_nid(const struct page *page) { return section_to_node_table[page_to_section(page)]; } @@ -500,7 +500,7 @@ void __init sparse_init(void) * so alloc 2M (with 2M align) and 24 bytes in turn will * make next 2M slip to one more 2M later. * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continously. + * here try to allocate 2M pages continuously. * * powerpc need to call sparse_init_one_section right after each * sparse_early_mem_map_alloc, so allocate usemap_map at first. diff --git a/mm/swap.c b/mm/swap.c index c02f93611a8..3a442f18b0b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -39,6 +39,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -178,15 +179,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -194,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); + spin_unlock_irqrestore(&zone->lru_lock, flags); release_pages(pvec->pages, pvec->nr, pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + enum lru_list lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. */ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -252,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, memcg_reclaim_stat->recent_rotated[file]++; } -/* - * FIXME: speed this up? - */ -void activate_page(struct page *page) +static void __activate_page(struct page *page, void *arg) { struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { int file = page_is_file_cache(page); int lru = page_lru_base_type(page); @@ -272,8 +288,45 @@ void activate_page(struct page *page) update_page_reclaim_stat(zone, page, file, 1); } +} + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + +static void activate_page_drain(int cpu) +{ + struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); +} + +void activate_page(struct page *page) +{ + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); + put_cpu_var(activate_page_pvecs); + } +} + +#else +static inline void activate_page_drain(int cpu) +{ +} + +void activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + __activate_page(page, NULL); spin_unlock_irq(&zone->lru_lock); } +#endif /* * Mark a page as having seen activity. @@ -347,6 +400,74 @@ void add_page_to_unevictable_list(struct page *page) } /* + * If the page can not be invalidated, it is moved to the + * inactive list to speed up its reclaim. It is moved to the + * head of the list, rather than the tail, to give the flusher + * threads some time to write it out, as this is much more + * effective than the single-page writeout from reclaim. + * + * If the page isn't page_mapped and dirty/writeback, the page + * could reclaim asap using PG_reclaim. + * + * 1. active, mapped page -> none + * 2. active, dirty/writeback page -> inactive, head, PG_reclaim + * 3. inactive, mapped page -> none + * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim + * 5. inactive, clean -> inactive, tail + * 6. Others -> none + * + * In 4, why it moves inactive's head, the VM expects the page would + * be write it out by flusher threads as this is much more effective + * than the single-page writeout from reclaim. + */ +static void lru_deactivate_fn(struct page *page, void *arg) +{ + int lru, file; + bool active; + struct zone *zone = page_zone(page); + + if (!PageLRU(page)) + return; + + if (PageUnevictable(page)) + return; + + /* Some processes are using the page */ + if (page_mapped(page)) + return; + + active = PageActive(page); + + file = page_is_file_cache(page); + lru = page_lru_base_type(page); + del_page_from_lru_list(zone, page, lru + active); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(zone, page, lru); + + if (PageWriteback(page) || PageDirty(page)) { + /* + * PG_reclaim could be raced with end_page_writeback + * It can make readahead confusing. But race window + * is _really_ small and it's non-critical problem. + */ + SetPageReclaim(page); + } else { + /* + * The page's writeback ends up during pagevec + * We moves tha page into tail of inactive. + */ + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + __count_vm_event(PGROTATED); + } + + if (active) + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(zone, page, file, 0); +} + +/* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. @@ -372,6 +493,38 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + + activate_page_drain(cpu); +} + +/** + * deactivate_page - forcefully deactivate a page + * @page: page to deactivate + * + * This function hints the VM that @page is a good reclaim candidate, + * for example if its invalidation fails due to the page being dirty + * or under writeback. + */ +void deactivate_page(struct page *page) +{ + /* + * In a workload with many unevictable page such as mprotect, unevictable + * page deactivation for accelerating reclaim is pointless. + */ + if (PageUnevictable(page)) + return; + + if (likely(get_page_unless_zero(page))) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } } void lru_add_drain(void) @@ -516,44 +669,33 @@ void lru_add_page_tail(struct zone* zone, } } +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); diff --git a/mm/swap_state.c b/mm/swap_state.c index 5c8cfabbc9b..46680461785 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -24,12 +24,10 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_page_list, to make sync_page look nicer, and to allow - * future use of radix_tree tags in the swap cache. + * vmscan's shrink_page_list. */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, .migratepage = migrate_page, }; @@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = { static struct backing_dev_info swap_backing_dev_info = { .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = swap_unplug_io_fn, }; struct address_space swapper_space = { diff --git a/mm/swapfile.c b/mm/swapfile.c index 0341c5700e3..17bc224bce6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -14,7 +14,7 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> -#include <linux/shm.h> +#include <linux/shmem_fs.h> #include <linux/blkdev.h> #include <linux/random.h> #include <linux/writeback.h> @@ -31,6 +31,7 @@ #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> +#include <linux/oom.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -95,39 +96,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) } /* - * We need this because the bdev->unplug_fn can sleep and we cannot - * hold swap_lock while calling the unplug_fn. And swap_lock - * cannot be turned into a mutex. - */ -static DECLARE_RWSEM(swap_unplug_sem); - -void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) -{ - swp_entry_t entry; - - down_read(&swap_unplug_sem); - entry.val = page_private(page); - if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)]->bdev; - struct backing_dev_info *bdi; - - /* - * If the page is removed from swapcache from under us (with a - * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page_private(page) above. - * If the WARN_ON triggers during a swapoff it maybe the race - * condition and it's harmless. However if it triggers without - * swapoff it signals a problem. - */ - WARN_ON(page_count(page) <= 1); - - bdi = bdev->bd_inode->i_mapping->backing_dev_info; - blk_run_backing_dev(bdi, page); - } - up_read(&swap_unplug_sem); -} - -/* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ @@ -212,8 +180,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) { unsigned long offset; unsigned long scan_base; @@ -880,7 +848,7 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -1550,6 +1518,36 @@ bad_bmap: goto out; } +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map) +{ + int i, prev; + + spin_lock(&swap_lock); + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; + p->swap_map = swap_map; + p->flags |= SWP_WRITEOK; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) + break; + prev = i; + } + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p->type; + else + swap_info[prev]->next = p->type; + spin_unlock(&swap_lock); +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -1558,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; char *pathname; + int oom_score_adj; int i, type, prev; int err; @@ -1616,37 +1615,22 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - current->flags |= PF_OOM_ORIGIN; + oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); err = try_to_unuse(type); - current->flags &= ~PF_OOM_ORIGIN; + test_set_oom_score_adj(oom_score_adj); if (err) { + /* + * reading p->prio and p->swap_map outside the lock is + * safe here because only sys_swapon and sys_swapoff + * change them, and there can be no other sys_swapon or + * sys_swapoff for this swap_info_struct at this point. + */ /* re-insert swap space back into swap_list */ - spin_lock(&swap_lock); - if (p->prio < 0) - p->prio = --least_priority; - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = type; - else - swap_info[prev]->next = type; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - p->flags |= SWP_WRITEOK; - spin_unlock(&swap_lock); + enable_swap_info(p, p->prio, p->swap_map); goto out_dput; } - /* wait for any unplug function to finish */ - down_write(&swap_unplug_sem); - up_write(&swap_unplug_sem); - destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -1697,19 +1681,14 @@ out: } #ifdef CONFIG_PROC_FS -struct proc_swaps { - struct seq_file seq; - int event; -}; - static unsigned swaps_poll(struct file *file, poll_table *wait) { - struct proc_swaps *s = file->private_data; + struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); - if (s->event != atomic_read(&proc_poll_event)) { - s->event = atomic_read(&proc_poll_event); + if (seq->poll_event != atomic_read(&proc_poll_event)) { + seq->poll_event = atomic_read(&proc_poll_event); return POLLIN | POLLRDNORM | POLLERR | POLLPRI; } @@ -1799,24 +1778,16 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - struct proc_swaps *s; + struct seq_file *seq; int ret; - s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); - if (!s) - return -ENOMEM; - - file->private_data = s; - ret = seq_open(file, &swaps_op); - if (ret) { - kfree(s); + if (ret) return ret; - } - s->seq.private = s; - s->event = atomic_read(&proc_poll_event); - return ret; + seq = file->private_data; + seq->poll_event = atomic_read(&proc_poll_event); + return 0; } static const struct file_operations proc_swaps_operations = { @@ -1844,49 +1815,24 @@ static int __init max_swapfiles_check(void) late_initcall(max_swapfiles_check); #endif -/* - * Written 01/25/92 by Simmule Turner, heavily changed by Linus. - * - * The swapon system call - */ -SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; - char *name = NULL; - struct block_device *bdev = NULL; - struct file *swap_file = NULL; - struct address_space *mapping; unsigned int type; - int i, prev; - int error; - union swap_header *swap_header; - unsigned int nr_good_pages; - int nr_extents = 0; - sector_t span; - unsigned long maxpages; - unsigned long swapfilepages; - unsigned char *swap_map = NULL; - struct page *page = NULL; - struct inode *inode = NULL; - int did_down = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } - error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); kfree(p); - goto out; + return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; @@ -1911,81 +1857,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->next = -1; spin_unlock(&swap_lock); - name = getname(specialfile); - error = PTR_ERR(name); - if (IS_ERR(name)) { - name = NULL; - goto bad_swap_2; - } - swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); - error = PTR_ERR(swap_file); - if (IS_ERR(swap_file)) { - swap_file = NULL; - goto bad_swap_2; - } - - p->swap_file = swap_file; - mapping = swap_file->f_mapping; - inode = mapping->host; - - error = -EBUSY; - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; + return p; +} - if (i == type || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) - goto bad_swap; - } +static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) +{ + int error; - error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = bdgrab(I_BDEV(inode)); - error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, + p->bdev = bdgrab(I_BDEV(inode)); + error = blkdev_get(p->bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { - bdev = NULL; - error = -EINVAL; - goto bad_swap; + p->bdev = NULL; + return -EINVAL; } - p->old_block_size = block_size(bdev); - error = set_blocksize(bdev, PAGE_SIZE); + p->old_block_size = block_size(p->bdev); + error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) - goto bad_swap; - p->bdev = bdev; + return error; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; mutex_lock(&inode->i_mutex); - did_down = 1; - if (IS_SWAPFILE(inode)) { - error = -EBUSY; - goto bad_swap; - } - } else { - goto bad_swap; - } + if (IS_SWAPFILE(inode)) + return -EBUSY; + } else + return -EINVAL; - swapfilepages = i_size_read(inode) >> PAGE_SHIFT; + return 0; +} - /* - * Read the swap header. - */ - if (!mapping->a_ops->readpage) { - error = -EINVAL; - goto bad_swap; - } - page = read_mapping_page(mapping, 0, swap_file); - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto bad_swap; - } - swap_header = kmap(page); +static unsigned long read_swap_header(struct swap_info_struct *p, + union swap_header *swap_header, + struct inode *inode) +{ + int i; + unsigned long maxpages; + unsigned long swapfilepages; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); - error = -EINVAL; - goto bad_swap; + return 0; } /* swap partition endianess hack... */ @@ -2001,8 +1915,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) printk(KERN_WARNING "Unable to handle swap header version %d\n", swap_header->info.version); - error = -EINVAL; - goto bad_swap; + return 0; } p->lowest_bit = 1; @@ -2011,20 +1924,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number of - * bits for the swap offset in the swp_entry_t type and - * 2) the number of bits in the a swap pte as defined by - * the different architectures. In order to find the - * largest possible bit mask a swap entry with swap type 0 + * device. There are three limiting factors: 1) the number + * of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte as defined by the + * the different architectures, and 3) the number of free bits + * in an exceptional radix_tree entry. In order to find the + * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again and finally the swap + * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. + * swap pte. Then the same is done for a radix_tree entry. */ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + swp_entry_to_pte(swp_entry(0, ~0UL)))); + maxpages = swp_offset(radix_to_swp_entry( + swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; + if (maxpages > swap_header->info.last_page) { maxpages = swap_header->info.last_page + 1; /* p->max is an unsigned int: don't overflow it */ @@ -2033,62 +1950,156 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } p->highest_bit = maxpages - 1; - error = -EINVAL; if (!maxpages) - goto bad_swap; + return 0; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); - goto bad_swap; + return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; + return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + return 0; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap; - } + return maxpages; +} + +static int setup_swap_map_and_extents(struct swap_info_struct *p, + union swap_header *swap_header, + unsigned char *swap_map, + unsigned long maxpages, + sector_t *span) +{ + int i; + unsigned int nr_good_pages; + int nr_extents; - memset(swap_map, 0, maxpages); nr_good_pages = maxpages - 1; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr == 0 || page_nr > swap_header->info.last_page) { - error = -EINVAL; - goto bad_swap; - } + if (page_nr == 0 || page_nr > swap_header->info.last_page) + return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; } } - error = swap_cgroup_swapon(type, maxpages); - if (error) - goto bad_swap; - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; p->pages = nr_good_pages; - nr_extents = setup_swap_extents(p, &span); - if (nr_extents < 0) { - error = nr_extents; - goto bad_swap; - } + nr_extents = setup_swap_extents(p, span); + if (nr_extents < 0) + return nr_extents; nr_good_pages = p->pages; } if (!nr_good_pages) { printk(KERN_WARNING "Empty swap-file\n"); + return -EINVAL; + } + + return nr_extents; +} + +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +{ + struct swap_info_struct *p; + char *name; + struct file *swap_file = NULL; + struct address_space *mapping; + int i; + int prio; + int error; + union swap_header *swap_header; + int nr_extents; + sector_t span; + unsigned long maxpages; + unsigned char *swap_map = NULL; + struct page *page = NULL; + struct inode *inode = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = alloc_swap_info(); + if (IS_ERR(p)) + return PTR_ERR(p); + + name = getname(specialfile); + if (IS_ERR(name)) { + error = PTR_ERR(name); + name = NULL; + goto bad_swap; + } + swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swap_file)) { + error = PTR_ERR(swap_file); + swap_file = NULL; + goto bad_swap; + } + + p->swap_file = swap_file; + mapping = swap_file->f_mapping; + + for (i = 0; i < nr_swapfiles; i++) { + struct swap_info_struct *q = swap_info[i]; + + if (q == p || !q->swap_file) + continue; + if (mapping == q->swap_file->f_mapping) { + error = -EBUSY; + goto bad_swap; + } + } + + inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + error = claim_swapfile(p, inode); + if (unlikely(error)) + goto bad_swap; + + /* + * Read the swap header. + */ + if (!mapping->a_ops->readpage) { + error = -EINVAL; + goto bad_swap; + } + page = read_mapping_page(mapping, 0, swap_file); + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto bad_swap; + } + swap_header = kmap(page); + + maxpages = read_swap_header(p, swap_header, inode); + if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap; } + /* OK, set up the swap map and apply the bad block list */ + swap_map = vzalloc(maxpages); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } + + error = swap_cgroup_swapon(p->type, maxpages); + if (error) + goto bad_swap; + + nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, + maxpages, &span); + if (unlikely(nr_extents < 0)) { + error = nr_extents; + goto bad_swap; + } + if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { p->flags |= SWP_SOLIDSTATE; @@ -2099,58 +2110,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } mutex_lock(&swapon_mutex); - spin_lock(&swap_lock); + prio = -1; if (swap_flags & SWAP_FLAG_PREFER) - p->prio = + prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - else - p->prio = --least_priority; - p->swap_map = swap_map; - p->flags |= SWP_WRITEOK; - nr_swap_pages += nr_good_pages; - total_swap_pages += nr_good_pages; + enable_swap_info(p, prio, swap_map); printk(KERN_INFO "Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk %s%s\n", - nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, + p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : ""); - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = type; - else - swap_info[prev]->next = type; - spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); + if (S_ISREG(inode->i_mode)) + inode->i_flags |= S_SWAPFILE; error = 0; goto out; bad_swap: - if (bdev) { - set_blocksize(bdev, p->old_block_size); - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (inode && S_ISBLK(inode->i_mode) && p->bdev) { + set_blocksize(p->bdev, p->old_block_size); + blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); - swap_cgroup_swapoff(type); -bad_swap_2: + swap_cgroup_swapoff(p->type); spin_lock(&swap_lock); p->swap_file = NULL; p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); - if (swap_file) + if (swap_file) { + if (inode && S_ISREG(inode->i_mode)) { + mutex_unlock(&inode->i_mutex); + inode = NULL; + } filp_close(swap_file, NULL); + } out: if (page && !IS_ERR(page)) { kunmap(page); @@ -2158,11 +2157,8 @@ out: } if (name) putname(name); - if (did_down) { - if (!error) - inode->i_flags |= S_SWAPFILE; + if (inode && S_ISREG(inode->i_mode)) mutex_unlock(&inode->i_mutex); - } return error; } diff --git a/mm/thrash.c b/mm/thrash.c index 2372d4ed5dd..e53f7d02c17 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -6,7 +6,7 @@ * Released under the GPL, see the file COPYING for details. * * Simple token based thrashing protection, using the algorithm - * described in: http://www.cs.wm.edu/~sjiang/token.pdf + * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html * * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> * Improved algorithm to pass token: @@ -21,14 +21,40 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/swap.h> +#include <linux/memcontrol.h> + +#include <trace/events/vmscan.h> + +#define TOKEN_AGING_INTERVAL (0xFF) static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; -static unsigned int global_faults; +struct mem_cgroup *swap_token_memcg; + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + + memcg = try_get_mem_cgroup_from_mm(mm); + if (memcg) + css_put(mem_cgroup_css(memcg)); + + return memcg; +} +#else +static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) +{ + return NULL; +} +#endif void grab_swap_token(struct mm_struct *mm) { int current_interval; + unsigned int old_prio = mm->token_priority; + static unsigned int global_faults; + static unsigned int last_aging; global_faults++; @@ -38,40 +64,92 @@ void grab_swap_token(struct mm_struct *mm) return; /* First come first served */ - if (swap_token_mm == NULL) { - mm->token_priority = mm->token_priority + 2; - swap_token_mm = mm; - goto out; + if (!swap_token_mm) + goto replace_token; + + /* + * Usually, we don't need priority aging because long interval faults + * makes priority decrease quickly. But there is one exception. If the + * token owner task is sleeping, it never make long interval faults. + * Thus, we need a priority aging mechanism instead. The requirements + * of priority aging are + * 1) An aging interval is reasonable enough long. Too short aging + * interval makes quick swap token lost and decrease performance. + * 2) The swap token owner task have to get priority aging even if + * it's under sleep. + */ + if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { + swap_token_mm->token_priority /= 2; + last_aging = global_faults; } - if (mm != swap_token_mm) { - if (current_interval < mm->last_interval) - mm->token_priority++; - else { - if (likely(mm->token_priority > 0)) - mm->token_priority--; - } - /* Check if we deserve the token */ - if (mm->token_priority > swap_token_mm->token_priority) { - mm->token_priority += 2; - swap_token_mm = mm; - } - } else { - /* Token holder came in again! */ + if (mm == swap_token_mm) { mm->token_priority += 2; + goto update_priority; + } + + if (current_interval < mm->last_interval) + mm->token_priority++; + else { + if (likely(mm->token_priority > 0)) + mm->token_priority--; } + /* Check if we deserve the token */ + if (mm->token_priority > swap_token_mm->token_priority) + goto replace_token; + +update_priority: + trace_update_swap_token_priority(mm, old_prio, swap_token_mm); + out: mm->faultstamp = global_faults; mm->last_interval = current_interval; spin_unlock(&swap_token_lock); + return; + +replace_token: + mm->token_priority += 2; + trace_replace_swap_token(swap_token_mm, mm); + swap_token_mm = mm; + swap_token_memcg = swap_token_memcg_from_mm(mm); + last_aging = global_faults; + goto out; } /* Called on process exit. */ void __put_swap_token(struct mm_struct *mm) { spin_lock(&swap_token_lock); - if (likely(mm == swap_token_mm)) + if (likely(mm == swap_token_mm)) { + trace_put_swap_token(swap_token_mm); swap_token_mm = NULL; + swap_token_memcg = NULL; + } spin_unlock(&swap_token_lock); } + +static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) +{ + if (!a) + return true; + if (!b) + return true; + if (a == b) + return true; + return false; +} + +void disable_swap_token(struct mem_cgroup *memcg) +{ + /* memcg reclaim don't disable unrelated mm token. */ + if (match_memcg(memcg, swap_token_memcg)) { + spin_lock(&swap_token_lock); + if (match_memcg(memcg, swap_token_memcg)) { + trace_disable_swap_token(swap_token_mm); + swap_token_mm = NULL; + swap_token_memcg = NULL; + } + spin_unlock(&swap_token_lock); + } +} diff --git a/mm/truncate.c b/mm/truncate.c index d64296be00d..b40ac6d4e86 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,6 +19,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ +#include <linux/cleancache.h> #include "internal.h" @@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); + cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -106,9 +108,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); clear_page_mlock(page); - remove_from_page_cache(page); ClearPageMappedToDisk(page); - page_cache_release(page); /* pagecache ref */ + delete_from_page_cache(page); return 0; } @@ -198,9 +199,6 @@ int invalidate_inode_page(struct page *page) * The first pass will remove most pages, so the search cost of the second pass * is low. * - * When looking at page->index outside the page lock we need to be careful to - * copy it into a local to avoid races (it could change at any time). - * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. @@ -209,12 +207,13 @@ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; - pgoff_t next; + pgoff_t index; + pgoff_t end; int i; + cleancache_flush_inode(mapping); if (mapping->nrpages == 0) return; @@ -222,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, end = (lend >> PAGE_CACHE_SHIFT); pagevec_init(&pvec, 0); - next = start; - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - if (page_index > end) { - next = page_index; + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; - } - if (page_index > next) - next = page_index; - next++; if (!trylock_page(page)) continue; + WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -250,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } if (partial) { @@ -262,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping, } } - next = start; + index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - if (next == start) + if (!pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (index == start) break; - next = start; + index = start; continue; } - if (pvec.pages[0]->index > end) { + if (index == start && pvec.pages[0]->index > end) { pagevec_release(&pvec); break; } @@ -279,19 +277,22 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - if (page->index > end) + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; + lock_page(page); + WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); - if (page->index > next) - next = page->index; - next++; unlock_page(page); } pagevec_release(&pvec); mem_cgroup_uncharge_end(); + index++; } + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -301,6 +302,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_mutex. + * + * Note: When this function returns, there can be a page in the process of + * deletion (inside __delete_from_page_cache()) in the specified range. Thus + * mapping->nrpages can be non-zero when this function returns even after + * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { @@ -322,48 +328,53 @@ EXPORT_SYMBOL(truncate_inode_pages); * pagetables. */ unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) + pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next = start; - unsigned long ret = 0; + pgoff_t index = start; + unsigned long ret; + unsigned long count = 0; int i; + /* + * Note: this function may get called on a shmem/tmpfs mapping: + * pagevec_lookup() might then return 0 prematurely (because it + * got a gangful of swap entries); but it's hardly worth worrying + * about - it can rarely have anything to free from such a mapping + * (most pages are dirty), and already skips over any difficulties. + */ + pagevec_init(&pvec, 0); - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t index; - int lock_failed; - - lock_failed = !trylock_page(page); - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ + /* We rely upon deletion not changing page->index */ index = page->index; - if (index > next) - next = index; - next++; - if (lock_failed) - continue; - - ret += invalidate_inode_page(page); + if (index > end) + break; + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); + ret = invalidate_inode_page(page); unlock_page(page); - if (next > end) - break; + /* + * Invalidation is a hint that the page is no longer + * of interest and try to speed up its reclaim. + */ + if (!ret) + deactivate_page(page); + count += ret; } pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } - return ret; + return count; } EXPORT_SYMBOL(invalidate_mapping_pages); @@ -389,7 +400,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) clear_page_mlock(page); BUG_ON(page_has_private(page)); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -427,36 +438,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next; + pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; - int wrapped = 0; + cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); - next = start; - while (next <= end && !wrapped && - pagevec_lookup(&pvec, mapping, next, - min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index; + + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; lock_page(page); + WARN_ON(page->index != index); if (page->mapping != mapping) { unlock_page(page); continue; } - page_index = page->index; - next = page_index + 1; - if (next == 0) - wrapped = 1; - if (page_index > end) { - unlock_page(page); - break; - } wait_on_page_writeback(page); if (page_mapped(page)) { if (!did_range_unmap) { @@ -464,9 +471,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - (loff_t)(end - page_index + 1) - << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, + (loff_t)(1 + end - index) + << PAGE_CACHE_SHIFT, 0); did_range_unmap = 1; } else { @@ -474,8 +481,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Just zap this page */ unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); + (loff_t)index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); } } BUG_ON(page_mapped(page)); @@ -491,7 +498,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } + cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); @@ -514,8 +523,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode - * @old: old file offset - * @new: new file offset + * @oldsize: old file size + * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. @@ -527,9 +536,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ -void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for @@ -540,9 +550,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + unmap_mapping_range(mapping, holebegin, 0, 1); + truncate_inode_pages(mapping, newsize); + unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); @@ -572,22 +582,47 @@ EXPORT_SYMBOL(truncate_setsize); /** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used - * @offset: file offset to start truncating + * @newsize: file offset to start truncating * * This function is deprecated and truncate_setsize or truncate_pagecache * should be used instead, together with filesystem specific block truncation. */ -int vmtruncate(struct inode *inode, loff_t offset) +int vmtruncate(struct inode *inode, loff_t newsize) { int error; - error = inode_newsize_ok(inode, offset); + error = inode_newsize_ok(inode, newsize); if (error) return error; - truncate_setsize(inode, offset); + truncate_setsize(inode, newsize); if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; } EXPORT_SYMBOL(vmtruncate); + +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(lstart, PAGE_SIZE); + loff_t holelen = 1 + lend - holebegin; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op->truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + unmap_mapping_range(mapping, holebegin, holelen, 1); + inode->i_op->truncate_range(inode, lstart, lend); + /* unmap again to remove racily COWed private pages */ + unmap_mapping_range(mapping, holebegin, holelen, 1); + mutex_unlock(&inode->i_mutex); + + return 0; +} diff --git a/mm/util.c b/mm/util.c index f126975ef23..88ea1bd661c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -6,6 +6,8 @@ #include <linux/sched.h> #include <asm/uaccess.h> +#include "internal.h" + #define CREATE_TRACE_POINTS #include <trace/events/kmem.h> @@ -215,6 +217,28 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); +void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) +{ + struct vm_area_struct *next; + + vma->vm_prev = prev; + if (prev) { + next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + next = rb_entry(rb_parent, + struct vm_area_struct, vm_rb); + else + next = NULL; + } + vma->vm_next = next; + if (next) + next->vm_prev = vma; +} + #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { @@ -227,7 +251,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. - * If the architecture not support this fucntion, simply return with no + * If the architecture not support this function, simply return with no * page pinned */ int __attribute__((weak)) __get_user_pages_fast(unsigned long start, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f9b166732e7..7ef0903058e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -26,7 +26,7 @@ #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -261,8 +261,15 @@ struct vmap_area { }; static DEFINE_SPINLOCK(vmap_area_lock); -static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static struct rb_root vmap_area_root = RB_ROOT; + +/* The vmap cache globals are protected by vmap_area_lock */ +static struct rb_node *free_vmap_cache; +static unsigned long cached_hole_size; +static unsigned long cached_vstart; +static unsigned long cached_align; + static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) @@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct rb_node *n; unsigned long addr; int purged = 0; + struct vmap_area *first; BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); + BUG_ON(!is_power_of_2(align)); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -341,91 +350,127 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-ENOMEM); retry: - addr = ALIGN(vstart, align); - spin_lock(&vmap_area_lock); - if (addr + size - 1 < addr) - goto overflow; + /* + * Invalidate cache if we have more permissive parameters. + * cached_hole_size notes the largest hole noticed _below_ + * the vmap_area cached in free_vmap_cache: if size fits + * into that hole, we want to scan from vstart to reuse + * the hole instead of allocating above free_vmap_cache. + * Note that __free_vmap_area may update free_vmap_cache + * without updating cached_hole_size or cached_align. + */ + if (!free_vmap_cache || + size < cached_hole_size || + vstart < cached_vstart || + align < cached_align) { +nocache: + cached_hole_size = 0; + free_vmap_cache = NULL; + } + /* record if we encounter less permissive parameters */ + cached_vstart = vstart; + cached_align = align; + + /* find starting point for our search */ + if (free_vmap_cache) { + first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); + addr = ALIGN(first->va_end, align); + if (addr < vstart) + goto nocache; + if (addr + size - 1 < addr) + goto overflow; - /* XXX: could have a last_hole cache */ - n = vmap_area_root.rb_node; - if (n) { - struct vmap_area *first = NULL; + } else { + addr = ALIGN(vstart, align); + if (addr + size - 1 < addr) + goto overflow; + + n = vmap_area_root.rb_node; + first = NULL; - do { + while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end >= addr) { - if (!first && tmp->va_start < addr + size) - first = tmp; - n = n->rb_left; - } else { first = tmp; + if (tmp->va_start <= addr) + break; + n = n->rb_left; + } else n = n->rb_right; - } - } while (n); + } if (!first) goto found; - - if (first->va_end < addr) { - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else - goto found; - } - - while (addr + size > first->va_start && addr + size <= vend) { - addr = ALIGN(first->va_end + PAGE_SIZE, align); - if (addr + size - 1 < addr) - goto overflow; - - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else - goto found; - } } -found: - if (addr + size > vend) { -overflow: - spin_unlock(&vmap_area_lock); - if (!purged) { - purge_vmap_area_lazy(); - purged = 1; - goto retry; - } - if (printk_ratelimit()) - printk(KERN_WARNING - "vmap allocation for size %lu failed: " - "use vmalloc=<size> to increase size.\n", size); - kfree(va); - return ERR_PTR(-EBUSY); + + /* from the starting point, walk areas until a suitable hole is found */ + while (addr + size > first->va_start && addr + size <= vend) { + if (addr + cached_hole_size < first->va_start) + cached_hole_size = first->va_start - addr; + addr = ALIGN(first->va_end, align); + if (addr + size - 1 < addr) + goto overflow; + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; } - BUG_ON(addr & (align-1)); +found: + if (addr + size > vend) + goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; __insert_vmap_area(va); + free_vmap_cache = &va->rb_node; spin_unlock(&vmap_area_lock); - return va; -} + BUG_ON(va->va_start & (align-1)); + BUG_ON(va->va_start < vstart); + BUG_ON(va->va_end > vend); -static void rcu_free_va(struct rcu_head *head) -{ - struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); + return va; +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING + "vmap allocation for size %lu failed: " + "use vmalloc=<size> to increase size.\n", size); kfree(va); + return ERR_PTR(-EBUSY); } static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + + if (free_vmap_cache) { + if (va->va_end < cached_vstart) { + free_vmap_cache = NULL; + } else { + struct vmap_area *cache; + cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); + if (va->va_start <= cache->va_start) { + free_vmap_cache = rb_prev(&va->rb_node); + /* + * We don't try to update cached_hole_size or + * cached_align, but it won't go very wrong. + */ + } + } + } rb_erase(&va->rb_node, &vmap_area_root); RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); @@ -439,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va) if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); - call_rcu(&va->rcu_head, rcu_free_va); + kfree_rcu(va, rcu_head); } /* @@ -680,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ -#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ - VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ - VMALLOC_PAGES / NR_CPUS / 16)) +#define VMAP_BBMAP_BITS \ + VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) @@ -785,13 +831,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) return vb; } -static void rcu_free_vb(struct rcu_head *head) -{ - struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); - - kfree(vb); -} - static void free_vmap_block(struct vmap_block *vb) { struct vmap_block *tmp; @@ -804,7 +843,7 @@ static void free_vmap_block(struct vmap_block *vb) BUG_ON(tmp != vb); free_vmap_area_noflush(vb->va); - call_rcu(&vb->rcu_head, rcu_free_vb); + kfree_rcu(vb, rcu_head); } static void purge_fragmented_blocks(int cpu) @@ -1482,6 +1521,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { + const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; @@ -1508,11 +1548,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; + gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; if (node < 0) - page = alloc_page(gfp_mask); + page = alloc_page(tmp_mask); else - page = alloc_pages_node(node, gfp_mask, 0); + page = alloc_pages_node(node, tmp_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1527,6 +1568,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: + warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " + "allocated %ld of %ld bytes\n", + (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); return NULL; } @@ -1951,8 +1995,6 @@ finished: * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any informaion, as /dev/kmem. - * - * The caller should guarantee KM_USER1 is not used. */ long vwrite(char *buf, char *addr, unsigned long count) @@ -2098,10 +2140,6 @@ struct vm_struct *alloc_vm_area(size_t size) return NULL; } - /* Make sure the pagetables are constructed in process kernel - mappings */ - vmalloc_sync_all(); - return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6771ea70bfe..7ef69124fa3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,8 @@ #include <linux/memcontrol.h> #include <linux/delayacct.h> #include <linux/sysctl.h> +#include <linux/oom.h> +#include <linux/prefetch.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -93,8 +95,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - int swappiness; - int order; /* @@ -105,6 +105,7 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + struct memcg_scanrecord *memcg_record; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -171,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); + return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, + zone_to_nid(zone), zone_idx(zone), BIT(lru)); return zone_page_state(zone, NR_LRU_BASE + lru); } @@ -200,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker) } EXPORT_SYMBOL(unregister_shrinker); +static inline int do_shrinker_shrink(struct shrinker *shrinker, + struct shrink_control *sc, + unsigned long nr_to_scan) +{ + sc->nr_to_scan = nr_to_scan; + return (*shrinker->shrink)(shrinker, sc); +} + #define SHRINK_BATCH 128 /* * Call the shrink functions to age shrinkable caches @@ -220,67 +230,114 @@ EXPORT_SYMBOL(unregister_shrinker); * * Returns the number of slab objects which we shrunk. */ -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) +unsigned long shrink_slab(struct shrink_control *shrink, + unsigned long nr_pages_scanned, + unsigned long lru_pages) { struct shrinker *shrinker; unsigned long ret = 0; - if (scanned == 0) - scanned = SWAP_CLUSTER_MAX; + if (nr_pages_scanned == 0) + nr_pages_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) - return 1; /* Assume we'll be able to shrink next time */ + if (!down_read_trylock(&shrinker_rwsem)) { + /* Assume we'll be able to shrink next time */ + ret = 1; + goto out; + } list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; unsigned long max_pass; + int shrink_ret = 0; + long nr; + long new_nr; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; - max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); - delta = (4 * scanned) / shrinker->seeks; + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + do { + nr = shrinker->nr; + } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + + total_scan = nr; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { + total_scan += delta; + if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to " "delete nr=%ld\n", - shrinker->shrink, shrinker->nr); - shrinker->nr = max_pass; + shrinker->shrink, total_scan); + total_scan = max_pass; } /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + + /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; - total_scan = shrinker->nr; - shrinker->nr = 0; + trace_mm_shrink_slab_start(shrinker, shrink, nr, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); - while (total_scan >= SHRINK_BATCH) { - long this_scan = SHRINK_BATCH; - int shrink_ret; + while (total_scan >= batch_size) { int nr_before; - nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); - shrink_ret = (*shrinker->shrink)(shrinker, this_scan, - gfp_mask); + nr_before = do_shrinker_shrink(shrinker, shrink, 0); + shrink_ret = do_shrinker_shrink(shrinker, shrink, + batch_size); if (shrink_ret == -1) break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, this_scan); - total_scan -= this_scan; + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; cond_resched(); } - shrinker->nr += total_scan; + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + do { + nr = shrinker->nr; + new_nr = total_scan + nr; + if (total_scan <= 0) + break; + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + + trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } up_read(&shrinker_rwsem); +out: + cond_resched(); return ret; } @@ -358,7 +415,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page_nosync(page); + lock_page(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); @@ -514,7 +571,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) freepage = mapping->a_ops->freepage; - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -936,7 +993,7 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty == nr_congested && nr_dirty != 0) + if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) zone_set_flag(zone, ZONE_CONGESTED); free_page_list(&free_pages); @@ -1065,7 +1122,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * surrounding the tag page. Only take those pages of * the same active state as that tag page. We may safely * round the target page pfn down to the requested order - * as the mem_map is guarenteed valid out to MAX_ORDER, + * as the mem_map is guaranteed valid out to MAX_ORDER, * where that page is in a different zone we will detect * it from its zone id and abort this block scan. */ @@ -1108,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_dirty++; scan++; } else { - /* the page is freed already. */ - if (!page_count(cursor_page)) + /* + * Check if the page is freed already. + * + * We can't use page_count() as that + * requires compound_head and we don't + * have a pin on the page here. If a + * page is tail, we may or may not + * have isolated the head, so assume + * it's not free, it'd be tricky to + * track the head status without a + * page pin. + */ + if (!PageTail(cursor_page) && + !atomic_read(&cursor_page->_count)) continue; break; } @@ -1200,13 +1269,16 @@ int isolate_lru_page(struct page *page) { int ret = -EBUSY; + VM_BUG_ON(!page_count(page)); + if (PageLRU(page)) { struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { + if (PageLRU(page)) { int lru = page_lru(page); ret = 0; + get_page(page); ClearPageLRU(page); del_page_from_lru_list(zone, page, lru); @@ -1277,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1320,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; + if (!scanning_global_lru(sc)) { + sc->memcg_record->nr_scanned[0] += *nr_anon; + sc->memcg_record->nr_scanned[1] += *nr_file; + } } /* @@ -1433,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_reclaimed += shrink_page_list(&page_list, zone, sc); } + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_freed[file] += nr_reclaimed; + local_irq_disable(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); @@ -1532,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[file] += nr_taken; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) @@ -1583,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * get_scan_ratio. */ reclaim_stat->recent_rotated[file] += nr_rotated; + if (!scanning_global_lru(sc)) + sc->memcg_record->nr_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); @@ -1698,24 +1783,11 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan) +static int vmscan_swappiness(struct scan_control *sc) { - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= SWAP_CLUSTER_MAX) - *nr_saved_scan = 0; - else - nr = 0; - - return nr; + if (scanning_global_lru(sc)) + return vm_swappiness; + return mem_cgroup_swappiness(sc->mem_cgroup); } /* @@ -1736,6 +1808,23 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, u64 fraction[2], denominator; enum lru_list l; int noswap = 0; + int force_scan = 0; + unsigned long nr_force_scan[2]; + + + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + + if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { + /* kswapd does zone balancing and need to scan this zone */ + if (scanning_global_lru(sc) && current_is_kswapd()) + force_scan = 1; + /* memcg may have small limit and need to avoid priority drop */ + if (!scanning_global_lru(sc)) + force_scan = 1; + } /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || (nr_swap_pages <= 0)) { @@ -1743,14 +1832,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 0; fraction[1] = 1; denominator = 1; + nr_force_scan[0] = 0; + nr_force_scan[1] = SWAP_CLUSTER_MAX; goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, @@ -1759,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 1; fraction[1] = 0; denominator = 1; + nr_force_scan[0] = SWAP_CLUSTER_MAX; + nr_force_scan[1] = 0; goto out; } } @@ -1767,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; + anon_prio = vmscan_swappiness(sc); + file_prio = 200 - vmscan_swappiness(sc); /* * OK, so we have swap space and a fair amount of page cache @@ -1807,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; + if (force_scan) { + unsigned long scan = SWAP_CLUSTER_MAX; + nr_force_scan[0] = div64_u64(scan * ap, denominator); + nr_force_scan[1] = div64_u64(scan * fp, denominator); + } out: for_each_evictable_lru(l) { int file = is_file_lru(l); @@ -1817,8 +1910,19 @@ out: scan >>= priority; scan = div64_u64(scan * fraction[file], denominator); } - nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l]); + + /* + * If zone is small or memcg is small, nr[l] can be 0. + * This results no-scan on this priority and priority drop down. + * For global direct reclaim, it can visit next zone and tend + * not to have problems. For global kswapd, it's for zone + * balancing and it need to scan a small amounts. When using + * memcg, priority drop can cause big latency. So, it's better + * to scan small amount. See may_noscan above. + */ + if (!scan && force_scan) + scan = nr_force_scan[file]; + nr[l] = scan; } } @@ -1963,6 +2067,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1977,6 +2083,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. + */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; + /* need some check for avoid more shrink_zone() */ } shrink_zone(priority, zone, sc); @@ -1988,17 +2107,12 @@ static bool zone_reclaimable(struct zone *zone) return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; } -/* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. It can't handle OOM during hibernation. - * So let's check zone's unreclaimable in direct reclaim as well as kswapd. - */ +/* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - bool all_unreclaimable = true; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2006,13 +2120,11 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone_reclaimable(zone)) { - all_unreclaimable = false; - break; - } + if (!zone->all_unreclaimable) + return false; } - return all_unreclaimable; + return true; } /* @@ -2032,7 +2144,8 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc) + struct scan_control *sc, + struct shrink_control *shrink) { int priority; unsigned long total_scanned = 0; @@ -2050,7 +2163,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) - disable_swap_token(); + disable_swap_token(sc->mem_cgroup); shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from @@ -2066,7 +2179,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, lru_pages += zone_reclaimable_pages(zone); } - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); + shrink_slab(shrink, sc->nr_scanned, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -2108,6 +2221,14 @@ out: if (sc->nr_reclaimed) return sc->nr_reclaimed; + /* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. Thus bypassing all_unreclaimable + * check. + */ + if (oom_killer_disabled) + return 0; + /* top priority shrink_zones still had more to do? don't OOM, then */ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) return 1; @@ -2125,17 +2246,19 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .nodemask = nodemask, }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2145,19 +2268,23 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - unsigned int swappiness, - struct zone *zone) + gfp_t gfp_mask, bool noswap, + struct zone *zone, + struct memcg_scanrecord *rec, + unsigned long *scanned) { struct scan_control sc = { + .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swappiness = swappiness, .order = 0, .mem_cgroup = mem, + .memcg_record = rec, }; + unsigned long start, end; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2165,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); + start = sched_clock(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2173,6 +2301,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); + end = sched_clock(); + + if (rec) + rec->elapsed += end - start; + *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2182,30 +2315,46 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, - unsigned int swappiness) + struct memcg_scanrecord *rec) { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long start, end; + int nid; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, + .memcg_record = rec, .nodemask = NULL, /* we don't care the placement */ + .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, }; - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - zonelist = NODE_DATA(numa_node_id())->node_zonelists; + start = sched_clock(); + /* + * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't + * take care of from where we get pages. So the node where we start the + * scan does not need to be the current node. + */ + nid = mem_cgroup_select_victim_node(mem_cont); + + zonelist = NODE_DATA(nid)->node_zonelists; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + end = sched_clock(); + if (rec) + rec->elapsed += end - start; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2224,7 +2373,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * o a 16M DMA zone that is balanced will not balance a zone on any * reasonable sized machine * o On all other machines, the top zone must be at least a reasonable - * precentage of the middle zones. For example, on 32-bit x86, highmem + * percentage of the middle zones. For example, on 32-bit x86, highmem * would need to be at least 256M for it to be balance a whole node. * Similarly, on x86-64 the Normal zone would need to be at least 1G * to balance a node on its own. These seemed like reasonable ratios. @@ -2238,7 +2387,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, for (i = 0; i <= classzone_idx; i++) present_pages += pgdat->node_zones[i].present_pages; - return balanced_pages > (present_pages >> 2); + /* A special case here: if zone has no page, we think it's balanced */ + return balanced_pages >= (present_pages >> 2); } /* is kswapd sleeping prematurely? */ @@ -2254,7 +2404,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, return true; /* Check the watermark levels */ - for (i = 0; i < pgdat->nr_zones; i++) { + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) @@ -2272,7 +2422,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, } if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - classzone_idx, 0)) + i, 0)) all_zones_ok = false; else balanced += zone->present_pages; @@ -2284,7 +2434,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, * must be balanced */ if (order) - return pgdat_balanced(pgdat, balanced, classzone_idx); + return !pgdat_balanced(pgdat, balanced, classzone_idx); else return !all_zones_ok; } @@ -2320,6 +2470,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, @@ -2329,10 +2481,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * we want to put equal scanning pressure on each zone. */ .nr_to_reclaim = ULONG_MAX, - .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; loop_again: total_scanned = 0; sc.nr_reclaimed = 0; @@ -2345,7 +2499,7 @@ loop_again: /* The swap token gets in the way of swapout... */ if (!priority) - disable_swap_token(); + disable_swap_token(NULL); all_zones_ok = 1; balanced = 0; @@ -2374,7 +2528,6 @@ loop_again: if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; - *classzone_idx = i; break; } } @@ -2397,9 +2550,9 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { - int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; + unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2409,45 +2562,42 @@ loop_again: sc.nr_scanned = 0; + nr_soft_scanned = 0; /* * Call soft limit reclaim before calling shrink_zone. - * For now we ignore the return value */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + total_scanned += nr_soft_scanned; /* - * We put equal pressure on every zone, unless one - * zone has way too many pages free already. + * We put equal pressure on every zone, unless + * one zone has way too many pages free + * already. The "too many pages" is defined + * as the high wmark plus a "gap" where the + * gap is either the low watermark or 1% + * of the zone, whichever is smaller. */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); if (!zone_watermark_ok_safe(zone, order, - 8*high_wmark_pages(zone), end_zone, 0)) + high_wmark_pages(zone) + balance_gap, + end_zone, 0)) { shrink_zone(priority, zone, &sc); - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, - lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_scanned += sc.nr_scanned; - - compaction = 0; - if (order && - zone_watermark_ok(zone, 0, - high_wmark_pages(zone), - end_zone, 0) && - !zone_watermark_ok(zone, order, - high_wmark_pages(zone), - end_zone, 0)) { - compact_zone_order(zone, - order, - sc.gfp_mask, false, - COMPACT_MODE_KSWAPD); - compaction = 1; + + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); + sc.nr_reclaimed += reclaim_state->reclaimed_slab; + total_scanned += sc.nr_scanned; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; } - if (zone->all_unreclaimable) - continue; - if (!compaction && nr_slab == 0 && - !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage @@ -2457,6 +2607,12 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + if (zone->all_unreclaimable) { + if (end_zone && end_zone == i) + end_zone--; + continue; + } + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; @@ -2635,8 +2791,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ static int kswapd(void *p) { - unsigned long order; - int classzone_idx; + unsigned long order, new_order; + int classzone_idx, new_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2666,17 +2822,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = 0; - classzone_idx = MAX_NR_ZONES - 1; + order = new_order = 0; + classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; for ( ; ; ) { - unsigned long new_order; - int new_classzone_idx; int ret; - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + /* + * If the last balance_pgdat was unsuccessful it's unlikely a + * new request of a similar or harder type will succeed soon + * so consider going to sleep on the basis we reclaimed at + */ + if (classzone_idx >= new_classzone_idx && order == new_order) { + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; + } + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' @@ -2689,7 +2851,7 @@ static int kswapd(void *p) order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + pgdat->classzone_idx = pgdat->nr_zones - 1; } ret = try_to_freeze(); @@ -2788,10 +2950,12 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .may_writepage = 1, .nr_to_reclaim = nr_to_reclaim, .hibernation_mode = 1, - .swappiness = vm_swappiness, .order = 0, }; - struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; @@ -2800,7 +2964,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2972,9 +3136,11 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .nr_to_reclaim = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, .order = order, }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, + }; unsigned long nr_slab_pages0, nr_slab_pages1; cond_resched(); @@ -3016,7 +3182,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) unsigned long lru_pages = zone_reclaimable_pages(zone); /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) + if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) break; /* Freed enough memory */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 0c3b5048773..20c18b7694b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -157,7 +157,7 @@ int calculate_normal_threshold(struct zone *zone) /* * Refresh the thresholds for each zone. */ -static void refresh_zone_stat_thresholds(void) +void refresh_zone_stat_thresholds(void) { struct zone *zone; int cpu; @@ -321,9 +321,12 @@ static inline void mod_state(struct zone *zone, /* * The fetching of the stat_threshold is racy. We may apply * a counter threshold to the wrong the cpu if we get - * rescheduled while executing here. However, the following - * will apply the threshold again and therefore bring the - * counter under the threshold. + * rescheduled while executing here. However, the next + * counter update will apply the threshold again and + * therefore bring the counter under the threshold again. + * + * Most of the time the thresholds are the same anyways + * for all cpus in a zone. */ t = this_cpu_read(pcp->stat_threshold); @@ -500,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu) * z = the zone from which the allocation occurred. * * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. */ -void zone_statistics(struct zone *preferred_zone, struct zone *z) +void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) { if (z->zone_pgdat == preferred_zone->zone_pgdat) { __inc_zone_state(z, NUMA_HIT); @@ -509,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } - if (z->node == numa_node_id()) + if (z->node == ((flags & __GFP_OTHER_NODE) ? + preferred_zone->node : numa_node_id())) __inc_zone_state(z, NUMA_LOCAL); else __inc_zone_state(z, NUMA_OTHER); @@ -651,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, } #endif +#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) +#ifdef CONFIG_ZONE_DMA +#define TEXT_FOR_DMA(xx) xx "_dma", +#else +#define TEXT_FOR_DMA(xx) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define TEXT_FOR_DMA32(xx) xx "_dma32", +#else +#define TEXT_FOR_DMA32(xx) +#endif + +#ifdef CONFIG_HIGHMEM +#define TEXT_FOR_HIGHMEM(xx) xx "_high", +#else +#define TEXT_FOR_HIGHMEM(xx) +#endif + +#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ + TEXT_FOR_HIGHMEM(xx) xx "_movable", + +const char * const vmstat_text[] = { + /* Zoned VM counters */ + "nr_free_pages", + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", + "nr_unevictable", + "nr_mlock", + "nr_anon_pages", + "nr_mapped", + "nr_file_pages", + "nr_dirty", + "nr_writeback", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", + "nr_page_table_pages", + "nr_kernel_stack", + "nr_unstable", + "nr_bounce", + "nr_vmscan_write", + "nr_writeback_temp", + "nr_isolated_anon", + "nr_isolated_file", + "nr_shmem", + "nr_dirtied", + "nr_written", + +#ifdef CONFIG_NUMA + "numa_hit", + "numa_miss", + "numa_foreign", + "numa_interleave", + "numa_local", + "numa_other", +#endif + "nr_anon_transparent_hugepages", + "nr_dirty_threshold", + "nr_dirty_background_threshold", + +#ifdef CONFIG_VM_EVENT_COUNTERS + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + + TEXTS_FOR_ZONES("pgalloc") + + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + + TEXTS_FOR_ZONES("pgrefill") + TEXTS_FOR_ZONES("pgsteal") + TEXTS_FOR_ZONES("pgscan_kswapd") + TEXTS_FOR_ZONES("pgscan_direct") + +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif + "pginodesteal", + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "kswapd_low_wmark_hit_quickly", + "kswapd_high_wmark_hit_quickly", + "kswapd_skip_congestion_wait", + "pageoutrun", + "allocstall", + + "pgrotated", + +#ifdef CONFIG_COMPACTION + "compact_blocks_moved", + "compact_pages_moved", + "compact_pagemigrate_failed", + "compact_stall", + "compact_fail", + "compact_success", +#endif + +#ifdef CONFIG_HUGETLB_PAGE + "htlb_buddy_alloc_success", + "htlb_buddy_alloc_fail", +#endif + "unevictable_pgs_culled", + "unevictable_pgs_scanned", + "unevictable_pgs_rescued", + "unevictable_pgs_mlocked", + "unevictable_pgs_munlocked", + "unevictable_pgs_cleared", + "unevictable_pgs_stranded", + "unevictable_pgs_mlockfreed", + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "thp_fault_alloc", + "thp_fault_fallback", + "thp_collapse_alloc", + "thp_collapse_alloc_failed", + "thp_split", +#endif + +#endif /* CONFIG_VM_EVENTS_COUNTERS */ +}; +#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ + + #ifdef CONFIG_PROC_FS static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) @@ -823,126 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = { .release = seq_release, }; -#ifdef CONFIG_ZONE_DMA -#define TEXT_FOR_DMA(xx) xx "_dma", -#else -#define TEXT_FOR_DMA(xx) -#endif - -#ifdef CONFIG_ZONE_DMA32 -#define TEXT_FOR_DMA32(xx) xx "_dma32", -#else -#define TEXT_FOR_DMA32(xx) -#endif - -#ifdef CONFIG_HIGHMEM -#define TEXT_FOR_HIGHMEM(xx) xx "_high", -#else -#define TEXT_FOR_HIGHMEM(xx) -#endif - -#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) xx "_movable", - -static const char * const vmstat_text[] = { - /* Zoned VM counters */ - "nr_free_pages", - "nr_inactive_anon", - "nr_active_anon", - "nr_inactive_file", - "nr_active_file", - "nr_unevictable", - "nr_mlock", - "nr_anon_pages", - "nr_mapped", - "nr_file_pages", - "nr_dirty", - "nr_writeback", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", - "nr_page_table_pages", - "nr_kernel_stack", - "nr_unstable", - "nr_bounce", - "nr_vmscan_write", - "nr_writeback_temp", - "nr_isolated_anon", - "nr_isolated_file", - "nr_shmem", - "nr_dirtied", - "nr_written", - -#ifdef CONFIG_NUMA - "numa_hit", - "numa_miss", - "numa_foreign", - "numa_interleave", - "numa_local", - "numa_other", -#endif - "nr_anon_transparent_hugepages", - "nr_dirty_threshold", - "nr_dirty_background_threshold", - -#ifdef CONFIG_VM_EVENT_COUNTERS - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - - TEXTS_FOR_ZONES("pgalloc") - - "pgfree", - "pgactivate", - "pgdeactivate", - - "pgfault", - "pgmajfault", - - TEXTS_FOR_ZONES("pgrefill") - TEXTS_FOR_ZONES("pgsteal") - TEXTS_FOR_ZONES("pgscan_kswapd") - TEXTS_FOR_ZONES("pgscan_direct") - -#ifdef CONFIG_NUMA - "zone_reclaim_failed", -#endif - "pginodesteal", - "slabs_scanned", - "kswapd_steal", - "kswapd_inodesteal", - "kswapd_low_wmark_hit_quickly", - "kswapd_high_wmark_hit_quickly", - "kswapd_skip_congestion_wait", - "pageoutrun", - "allocstall", - - "pgrotated", - -#ifdef CONFIG_COMPACTION - "compact_blocks_moved", - "compact_pages_moved", - "compact_pagemigrate_failed", - "compact_stall", - "compact_fail", - "compact_success", -#endif - -#ifdef CONFIG_HUGETLB_PAGE - "htlb_buddy_alloc_success", - "htlb_buddy_alloc_fail", -#endif - "unevictable_pgs_culled", - "unevictable_pgs_scanned", - "unevictable_pgs_rescued", - "unevictable_pgs_mlocked", - "unevictable_pgs_munlocked", - "unevictable_pgs_cleared", - "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", -#endif -}; - static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -1181,7 +1201,6 @@ static int __init setup_vmstat(void) #ifdef CONFIG_SMP int cpu; - refresh_zone_stat_thresholds(); register_cpu_notifier(&vmstat_notifier); for_each_online_cpu(cpu) |