diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 91 | ||||
-rw-r--r-- | mm/Makefile | 16 | ||||
-rw-r--r-- | mm/backing-dev.c | 18 | ||||
-rw-r--r-- | mm/bounce.c | 287 | ||||
-rw-r--r-- | mm/cma.c | 335 | ||||
-rw-r--r-- | mm/compaction.c | 343 | ||||
-rw-r--r-- | mm/dmapool.c | 31 | ||||
-rw-r--r-- | mm/early_ioremap.c | 245 | ||||
-rw-r--r-- | mm/filemap.c | 1295 | ||||
-rw-r--r-- | mm/fremap.c | 39 | ||||
-rw-r--r-- | mm/frontswap.c | 13 | ||||
-rw-r--r-- | mm/gup.c | 674 | ||||
-rw-r--r-- | mm/highmem.c | 86 | ||||
-rw-r--r-- | mm/huge_memory.c | 276 | ||||
-rw-r--r-- | mm/hugetlb.c | 878 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 52 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 3 | ||||
-rw-r--r-- | mm/internal.h | 54 | ||||
-rw-r--r-- | mm/iov_iter.c | 746 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 36 | ||||
-rw-r--r-- | mm/kmemleak.c | 180 | ||||
-rw-r--r-- | mm/ksm.c | 9 | ||||
-rw-r--r-- | mm/list_lru.c | 16 | ||||
-rw-r--r-- | mm/madvise.c | 5 | ||||
-rw-r--r-- | mm/memblock.c | 281 | ||||
-rw-r--r-- | mm/memcontrol.c | 2648 | ||||
-rw-r--r-- | mm/memory-failure.c | 152 | ||||
-rw-r--r-- | mm/memory.c | 1331 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 193 | ||||
-rw-r--r-- | mm/mempolicy.c | 237 | ||||
-rw-r--r-- | mm/mempool.c | 12 | ||||
-rw-r--r-- | mm/migrate.c | 141 | ||||
-rw-r--r-- | mm/mincore.c | 20 | ||||
-rw-r--r-- | mm/mlock.c | 11 | ||||
-rw-r--r-- | mm/mmap.c | 221 | ||||
-rw-r--r-- | mm/mmu_context.c | 3 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 40 | ||||
-rw-r--r-- | mm/mprotect.c | 56 | ||||
-rw-r--r-- | mm/mremap.c | 9 | ||||
-rw-r--r-- | mm/msync.c | 9 | ||||
-rw-r--r-- | mm/nobootmem.c | 6 | ||||
-rw-r--r-- | mm/nommu.c | 59 | ||||
-rw-r--r-- | mm/oom_kill.c | 34 | ||||
-rw-r--r-- | mm/page-writeback.c | 56 | ||||
-rw-r--r-- | mm/page_alloc.c | 689 | ||||
-rw-r--r-- | mm/page_cgroup.c | 12 | ||||
-rw-r--r-- | mm/page_io.c | 42 | ||||
-rw-r--r-- | mm/percpu-vm.c | 22 | ||||
-rw-r--r-- | mm/percpu.c | 211 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 2 | ||||
-rw-r--r-- | mm/process_vm_access.c | 272 | ||||
-rw-r--r-- | mm/readahead.c | 34 | ||||
-rw-r--r-- | mm/rmap.c | 95 | ||||
-rw-r--r-- | mm/shmem.c | 764 | ||||
-rw-r--r-- | mm/slab.c | 825 | ||||
-rw-r--r-- | mm/slab.h | 92 | ||||
-rw-r--r-- | mm/slab_common.c | 403 | ||||
-rw-r--r-- | mm/slob.c | 13 | ||||
-rw-r--r-- | mm/slub.c | 603 | ||||
-rw-r--r-- | mm/sparse.c | 6 | ||||
-rw-r--r-- | mm/swap.c | 327 | ||||
-rw-r--r-- | mm/swap_state.c | 11 | ||||
-rw-r--r-- | mm/swapfile.c | 274 | ||||
-rw-r--r-- | mm/truncate.c | 172 | ||||
-rw-r--r-- | mm/util.c | 172 | ||||
-rw-r--r-- | mm/vmacache.c | 132 | ||||
-rw-r--r-- | mm/vmalloc.c | 53 | ||||
-rw-r--r-- | mm/vmscan.c | 632 | ||||
-rw-r--r-- | mm/vmstat.c | 33 | ||||
-rw-r--r-- | mm/workingset.c | 414 | ||||
-rw-r--r-- | mm/zbud.c | 101 | ||||
-rw-r--r-- | mm/zpool.c | 364 | ||||
-rw-r--r-- | mm/zsmalloc.c | 108 | ||||
-rw-r--r-- | mm/zswap.c | 131 |
74 files changed, 10901 insertions, 7355 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 2888024e0b0..886db215853 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -134,6 +134,9 @@ config HAVE_MEMBLOCK config HAVE_MEMBLOCK_NODE_MAP boolean +config HAVE_MEMBLOCK_PHYS_MAP + boolean + config ARCH_DISCARD_MEMBLOCK boolean @@ -216,6 +219,7 @@ config PAGEFLAGS_EXTENDED # config SPLIT_PTLOCK_CPUS int + default "999999" if !MMU default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 default "4" @@ -263,6 +267,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + boolean + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT @@ -429,16 +436,6 @@ choice benefit. endchoice -config CROSS_MEMORY_ATTACH - bool "Cross Memory Support" - depends on MMU - default y - help - Enabling this option adds the system calls process_vm_readv and - process_vm_writev which allow a process with the correct privileges - to directly read from or write to to another process's address space. - See the man page for more details. - # # UP and nommu archs use km based percpu allocator # @@ -511,21 +508,34 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. -config ZBUD - tristate - default n +config CMA_AREAS + int "Maximum count of the CMA areas" + depends on CMA + default 7 help - A special purpose allocator for storing compressed pages. - It is designed to store up to two compressed pages per physical - page. While this design limits storage density, it has simple and - deterministic reclaim properties that make it preferable to a higher - density approach when reclaim will be used. + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area. This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value "7". + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO - select ZBUD + select ZPOOL default n help A lightweight compressed cache for swap pages. It takes @@ -541,20 +551,25 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. -config MEM_SOFT_DIRTY - bool "Track memory changes" - depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS - select PROC_PAGE_MONITOR +config ZPOOL + tristate "Common API for compressed memory storage" + default n help - This option enables memory changes tracking by introducing a - soft-dirty bit on pte-s. This bit it set when someone writes - into a page just as regular dirty bit, but unlike the latter - it can be cleared by hands. + Compressed memory storage API. This allows using either zbud or + zsmalloc. - See Documentation/vm/soft-dirty.txt for more details. +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. config ZSMALLOC - bool "Memory allocator for compressed pages" + tristate "Memory allocator for compressed pages" depends on MMU default n help @@ -577,3 +592,21 @@ config PGTABLE_MAPPING You can check speed with zsmalloc benchmark: https://github.com/spartacus06/zsmapbench + +config GENERIC_EARLY_IOREMAP + bool + +config MAX_STACK_SIZE_MB + int "Maximum user stack size for 32-bit processes (MB)" + default 80 + range 8 256 if METAG + range 8 2048 + depends on STACK_GROWSUP && (!64BIT || COMPAT) + help + This is the maximum stack size in Megabytes in the VM layout of 32-bit + user processes when the stack grows upwards (currently only on parisc + and metag arch). The stack will be located at the highest memory + address minus the given value, unless the RLIMIT_STACK hard limit is + changed to a smaller value in which case that is used. + + A sane initial value is 80 MB. diff --git a/mm/Makefile b/mm/Makefile index 310c90a0926..fe7a053c0f4 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o @@ -11,13 +11,14 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif -obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o \ - interval_tree.o list_lru.o $(mmu-y) + compaction.o balloon_compaction.o vmacache.o \ + interval_tree.o list_lru.o workingset.o \ + iov_iter.o $(mmu-y) obj-y += init-mm.o @@ -27,9 +28,11 @@ else obj-y += bootmem.o endif +ifdef CONFIG_MMU + obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o madvise.o +endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o @@ -59,5 +62,8 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o +obj-$(CONFIG_CMA) += cma.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ce682f7a4f2..1706cbbdf5f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -288,13 +288,19 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). */ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -307,9 +313,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); - - /* bdi_list is now unused, clear it to mark @bdi dying */ - INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -360,6 +363,11 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ bdi_remove_from_list(bdi); + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + clear_bit(BDI_registered, &bdi->state); + spin_unlock_bh(&bdi->wb_lock); + /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi @@ -549,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) bit = sync ? BDI_sync_congested : BDI_async_congested; if (test_and_clear_bit(bit, &bdi->state)) atomic_dec(&nr_bdi_congested[sync]); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } diff --git a/mm/bounce.c b/mm/bounce.c deleted file mode 100644 index 523918b8c6d..00000000000 --- a/mm/bounce.c +++ /dev/null @@ -1,287 +0,0 @@ -/* bounce buffer handling for block devices - * - * - Split from highmem.c - */ - -#include <linux/mm.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/gfp.h> -#include <linux/bio.h> -#include <linux/pagemap.h> -#include <linux/mempool.h> -#include <linux/blkdev.h> -#include <linux/init.h> -#include <linux/hash.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <asm/tlbflush.h> - -#include <trace/events/block.h> - -#define POOL_SIZE 64 -#define ISA_POOL_SIZE 16 - -static mempool_t *page_pool, *isa_page_pool; - -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) -static __init int init_emergency_pool(void) -{ -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) - if (max_pfn <= max_low_pfn) - return 0; -#endif - - page_pool = mempool_create_page_pool(POOL_SIZE, 0); - BUG_ON(!page_pool); - printk("bounce pool size: %d pages\n", POOL_SIZE); - - return 0; -} - -__initcall(init_emergency_pool); -#endif - -#ifdef CONFIG_HIGHMEM -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned long flags; - unsigned char *vto; - - local_irq_save(flags); - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); - local_irq_restore(flags); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif /* CONFIG_HIGHMEM */ - -/* - * allocate pages in the DMA region for the ISA pool - */ -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - if (isa_page_pool) - return 0; - - isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(!isa_page_pool); - - printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); - return 0; -} - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - unsigned char *vfrom; - struct bio_vec tovec, *fromvec = from->bi_io_vec; - struct bvec_iter iter; - - bio_for_each_segment(tovec, to, iter) { - if (tovec.bv_page != fromvec->bv_page) { - /* - * fromvec->bv_offset and fromvec->bv_len might have - * been modified by the block layer, so use the original - * copy, bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); - } - - fromvec++; - } -} - -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; - int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - - /* - * free up bounce indirect pages used - */ - bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); - } - - bio_endio(bio_orig, err); - bio_put(bio); -} - -static void bounce_end_io_write(struct bio *bio, int err) -{ - bounce_end_io(bio, page_pool, err); -} - -static void bounce_end_io_write_isa(struct bio *bio, int err) -{ - - bounce_end_io(bio, isa_page_pool, err); -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio, pool, err); -} - -static void bounce_end_io_read(struct bio *bio, int err) -{ - __bounce_end_io_read(bio, page_pool, err); -} - -static void bounce_end_io_read_isa(struct bio *bio, int err) -{ - __bounce_end_io_read(bio, isa_page_pool, err); -} - -#ifdef CONFIG_NEED_BOUNCE_POOL -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - if (bio_data_dir(bio) != WRITE) - return 0; - - if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) - return 0; - - return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); -} -#else -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - return 0; -} -#endif /* CONFIG_NEED_BOUNCE_POOL */ - -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool, int force) -{ - struct bio *bio; - int rw = bio_data_dir(*bio_orig); - struct bio_vec *to, from; - struct bvec_iter iter; - unsigned i; - - if (force) - goto bounce; - bio_for_each_segment(from, *bio_orig, iter) - if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) - goto bounce; - - return; -bounce: - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - - bio_for_each_segment_all(to, bio, i) { - struct page *page = to->bv_page; - - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) - continue; - - inc_zone_page_state(to->bv_page, NR_BOUNCE); - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - - if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); - } - } - - trace_block_bio_bounce(q, *bio_orig); - - bio->bi_flags |= (1 << BIO_BOUNCED); - - if (pool == page_pool) { - bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } - - bio->bi_private = *bio_orig; - *bio_orig = bio; -} - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) -{ - int must_bounce; - mempool_t *pool; - - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - - must_bounce = must_snapshot_stable_pages(q, *bio_orig); - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) - return; - pool = page_pool; - } else { - BUG_ON(!isa_page_pool); - pool = isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool, must_bounce); -} - -EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/cma.c b/mm/cma.c new file mode 100644 index 00000000000..c17751c0dca --- /dev/null +++ b/mm/cma.c @@ -0,0 +1,335 @@ +/* + * Contiguous Memory Allocator + * + * Copyright (c) 2010-2011 by Samsung Electronics. + * Copyright IBM Corporation, 2013 + * Copyright LG Electronics Inc., 2014 + * Written by: + * Marek Szyprowski <m.szyprowski@samsung.com> + * Michal Nazarewicz <mina86@mina86.com> + * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * Joonsoo Kim <iamjoonsoo.kim@lge.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include <linux/memblock.h> +#include <linux/err.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/log2.h> +#include <linux/cma.h> + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +}; + +static struct cma cma_areas[MAX_CMA_AREAS]; +static unsigned cma_area_count; +static DEFINE_MUTEX(cma_mutex); + +phys_addr_t cma_get_base(struct cma *cma) +{ + return PFN_PHYS(cma->base_pfn); +} + +unsigned long cma_get_size(struct cma *cma) +{ + return cma->count << PAGE_SHIFT; +} + +static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +{ + return (1UL << (align_order >> cma->order_per_bit)) - 1; +} + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +{ + unsigned long bitmap_no, bitmap_count; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + mutex_lock(&cma->lock); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + mutex_unlock(&cma->lock); +} + +static int __init cma_activate_area(struct cma *cma) +{ + int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); + unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; + unsigned i = cma->count >> pageblock_order; + struct zone *zone; + + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + + if (!cma->bitmap) + return -ENOMEM; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + + do { + unsigned j; + + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. + */ + if (page_zone(pfn_to_page(pfn)) != zone) + goto err; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + + mutex_init(&cma->lock); + return 0; + +err: + kfree(cma->bitmap); + return -EINVAL; +} + +static int __init cma_init_reserved_areas(void) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = cma_activate_area(&cma_areas[i]); + + if (ret) + return ret; + } + + return 0; +} +core_initcall(cma_init_reserved_areas); + +/** + * cma_declare_contiguous() - reserve custom contiguous area + * @base: Base address of the reserved area optional, use 0 for any + * @size: Size of the reserved area (in bytes), + * @limit: End address of the reserved memory (optional, 0 for any). + * @alignment: Alignment for the CMA area, should be power of 2 or zero + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @fixed: hint about where to place the reserved area + * @res_cma: Pointer to store the created cma region. + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, struct cma **res_cma) +{ + struct cma *cma; + int ret = 0; + + pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", + __func__, (unsigned long)size, (unsigned long)base, + (unsigned long)limit, (unsigned long)alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* + * Sanitise input arguments. + * Pages both ends in CMA area could be merged into adjacent unmovable + * migratetype page by page allocator's buddy algorithm. In the case, + * you couldn't get a contiguous memory, which is not what we want. + */ + alignment = max(alignment, + (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + /* Reserve memory */ + if (base && fixed) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + ret = -EBUSY; + goto err; + } + } else { + phys_addr_t addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } else { + base = addr; + } + } + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma = &cma_areas[cma_area_count]; + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + + pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, + (unsigned long)base); + return 0; + +err: + pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return ret; +} + +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +{ + unsigned long mask, pfn, start = 0; + unsigned long bitmap_maxno, bitmap_no, bitmap_count; + struct page *page = NULL; + int ret; + + if (!cma || !cma->count) + return NULL; + + pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) + return NULL; + + mask = cma_bitmap_aligned_mask(cma, align); + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + for (;;) { + mutex_lock(&cma->lock); + bitmap_no = bitmap_find_next_zero_area(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask); + if (bitmap_no >= bitmap_maxno) { + mutex_unlock(&cma->lock); + break; + } + bitmap_set(cma->bitmap, bitmap_no, bitmap_count); + /* + * It's safe to drop the lock here. We've marked this region for + * our exclusive use. If the migration fails we will take the + * lock again and unmark it. + */ + mutex_unlock(&cma->lock); + + pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); + mutex_lock(&cma_mutex); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + mutex_unlock(&cma_mutex); + if (ret == 0) { + page = pfn_to_page(pfn); + break; + } + + cma_clear_bitmap(cma, pfn, count); + if (ret != -EBUSY) + break; + + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = bitmap_no + mask + 1; + } + + pr_debug("%s(): returned %p\n", __func__, page); + return page; +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool cma_release(struct cma *cma, struct page *pages, int count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pr_debug("%s(page %p)\n", __func__, (void *)pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return false; + + VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); + + free_contig_range(pfn, count); + cma_clear_bitmap(cma, pfn, count); + + return true; +} diff --git a/mm/compaction.c b/mm/compaction.c index 918577595ea..21bf292b642 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = start_pfn; + zone->compact_cached_migrate_pfn[1] = start_pfn; zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { struct zone *zone = cc->zone; + unsigned long pfn; if (cc->ignore_skip_hint) return; @@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (!nr_isolated) { - unsigned long pfn = page_to_pfn(page); + if (nr_isolated) + return; + + /* + * Only skip pageblocks when all forms of compaction will be known to + * fail in the near future. + */ + if (set_unsuitable) set_pageblock_skip(page); - /* Update where compaction should restart */ - if (migrate_scanner) { - if (!cc->finished_update_migrate && - pfn > zone->compact_cached_migrate_pfn) - zone->compact_cached_migrate_pfn = pfn; - } else { - if (!cc->finished_update_free && - pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + pfn = page_to_pfn(page); + + /* Update where async and sync compaction should restart */ + if (migrate_scanner) { + if (cc->finished_update_migrate) + return; + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; + } else { + if (cc->finished_update_free) + return; + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } } #else @@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } /* async aborts if taking too long or contended */ - if (!cc->sync) { + if (cc->mode == MIGRATE_ASYNC) { cc->contended = true; return false; } @@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } -static inline bool compact_trylock_irqsave(spinlock_t *lock, - unsigned long *flags, struct compact_control *cc) +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and either schedules in sync compaction or aborts async + * compaction. This is similar to what compact_checklock_irqsave() does, but + * is used where no lock is concerned. + * + * Returns false when no scheduling was needed, or sync compaction scheduled. + * Returns true when async compaction should abort. + */ +static inline bool compact_should_abort(struct compact_control *cc) { - return compact_checklock_irqsave(lock, flags, false, cc); + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = true; + return true; + } + + cond_resched(); + } + + return false; } /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_RESERVE) - return false; - - if (is_migrate_isolate(migratetype)) - return false; - - /* If the page is a large free page, then allow migration */ + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; + return false; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) + if (migrate_async_suitable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -253,6 +276,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, struct page *cursor, *valid_page = NULL; unsigned long flags; bool locked = false; + bool checked_pageblock = false; cursor = pfn_to_page(blockpfn); @@ -284,8 +308,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; /* Recheck this is a suitable migration target under lock */ - if (!strict && !suitable_migration_target(page)) - break; + if (!strict && !checked_pageblock) { + /* + * We need to check suitability of pageblock only once + * and this isolate_freepages_block() is called with + * pageblock range, so just check once is sufficient. + */ + checked_pageblock = true; + if (!suitable_migration_target(page)) + break; + } /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) @@ -329,7 +361,8 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); + update_pageblock_skip(cc, valid_page, total_isolated, true, + false); count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -460,12 +493,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; - isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; - bool skipped_async_unsuitable = false; + bool set_unsuitable = true; + const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? + ISOLATE_ASYNC_MIGRATE : 0) | + (unevictable ? ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -474,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode == MIGRATE_ASYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -483,11 +518,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } + if (compact_should_abort(cc)) + return 0; + /* Time to isolate some pages for migration */ - cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; @@ -526,8 +563,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* If isolation recently failed, do not retry */ pageblock_nr = low_pfn >> pageblock_order; - if (!isolation_suitable(cc, page)) - goto next_pageblock; + if (last_pageblock_nr != pageblock_nr) { + int mt; + + last_pageblock_nr = pageblock_nr; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + + /* + * For async migration, also only scan in MOVABLE + * blocks. Async migration is optimistic to see if + * the minimum amount of work satisfies the allocation + */ + mt = get_pageblock_migratetype(page); + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(mt)) { + set_unsuitable = false; + goto next_pageblock; + } + } /* * Skip if free. page_order cannot be used without zone->lock @@ -537,18 +591,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; /* - * For async migration, also only scan in MOVABLE blocks. Async - * migration is optimistic to see if the minimum amount of work - * satisfies the allocation - */ - if (!cc->sync && last_pageblock_nr != pageblock_nr && - !migrate_async_suitable(get_pageblock_migratetype(page))) { - cc->finished_update_migrate = true; - skipped_async_unsuitable = true; - goto next_pageblock; - } - - /* * Check may be lockless but that's ok as we recheck later. * It's possible to migrate LRU pages and balloon pages * Skip any other type of page @@ -557,11 +599,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (unlikely(balloon_page_movable(page))) { if (locked && balloon_page_isolate(page)) { /* Successfully isolated */ - cc->finished_update_migrate = true; - list_add(&page->lru, migratelist); - cc->nr_migratepages++; - nr_isolated++; - goto check_compact_cluster; + goto isolate_success; } } continue; @@ -584,6 +622,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } + /* + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. + */ + if (!page_mapping(page) && + page_count(page) > page_mapcount(page)) + continue; + /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); @@ -598,12 +645,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (!cc->sync) - mode |= ISOLATE_ASYNC_MIGRATE; - - if (unevictable) - mode |= ISOLATE_UNEVICTABLE; - lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ @@ -613,13 +654,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON_PAGE(PageTransCompound(page), page); /* Successfully isolated */ - cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); + +isolate_success: + cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; -check_compact_cluster: /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; @@ -630,7 +672,6 @@ check_compact_cluster: next_pageblock: low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); @@ -641,11 +682,10 @@ next_pageblock: /* * Update the pageblock-skip information and cached scanner pfn, * if the whole pageblock was scanned without isolating any page. - * This is not done when pageblock was skipped due to being unsuitable - * for async compaction, so that eventual sync compaction can try. */ - if (low_pfn == end_pfn && !skipped_async_unsuitable) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn) + update_pageblock_skip(cc, valid_page, nr_isolated, + set_unsuitable, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -666,44 +706,48 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; + unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long block_end_pfn; /* end of current pageblock */ + unsigned long low_pfn; /* lowest pfn scanner is able to scan */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; /* * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. We need this aligned to + * the pageblock boundary, because we do + * block_start_pfn -= pageblock_nr_pages in the for loop. + * For ending point, take care when isolating in last pageblock of a + * a zone which ends in the middle of a pageblock. + * The low boundary is the end of the pageblock the migration scanner + * is using. */ - pfn = cc->free_pfn; + block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_end_pfn = min(block_start_pfn + pageblock_nr_pages, + zone_end_pfn(zone)); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. - */ - high_pfn = min(low_pfn, pfn); - - z_end_pfn = zone_end_pfn(zone); - - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { + for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages) { unsigned long isolated; /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need - * to schedule. + * to schedule, or even abort async compaction. */ - cond_resched(); + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; - if (!pfn_valid(pfn)) + if (!pfn_valid(block_start_pfn)) continue; /* @@ -713,7 +757,7 @@ static void isolate_freepages(struct zone *zone, * i.e. it's possible that all pages within a zones range of * pages do not belong to a single zone. */ - page = pfn_to_page(pfn); + page = pfn_to_page(block_start_pfn); if (page_zone(page) != zone) continue; @@ -726,29 +770,26 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - isolated = 0; + cc->free_pfn = block_start_pfn; + isolated = isolate_freepages_block(cc, block_start_pfn, + block_end_pfn, freelist, false); + nr_freepages += isolated; /* - * As pfn may not start aligned, pfn+pageblock_nr_page - * may cross a MAX_ORDER_NR_PAGES boundary and miss - * a pfn_valid check. Ensure isolate_freepages_block() - * only scans within a pageblock + * Set a flag that we successfully isolated in this pageblock. + * In the next loop iteration, zone->compact_cached_free_pfn + * will not be updated and thus it will effectively contain the + * highest pageblock we isolated pages from. */ - end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - end_pfn = min(end_pfn, z_end_pfn); - isolated = isolate_freepages_block(cc, pfn, end_pfn, - freelist, false); - nr_freepages += isolated; + if (isolated) + cc->finished_update_free = true; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * isolate_freepages_block() might have aborted due to async + * compaction being contended */ - if (isolated) { - cc->finished_update_free = true; - high_pfn = max(high_pfn, pfn); - } + if (cc->contended) + break; } /* split_free_page does not map the pages */ @@ -758,10 +799,9 @@ static void isolate_freepages(struct zone *zone, * If we crossed the migrate scanner, we want to keep it that way * so that compact_finished() may detect this */ - if (pfn < low_pfn) - cc->free_pfn = max(pfn, zone->zone_start_pfn); - else - cc->free_pfn = high_pfn; + if (block_start_pfn < low_pfn) + cc->free_pfn = cc->migrate_pfn; + cc->nr_freepages = nr_freepages; } @@ -776,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* Isolate free pages if necessary */ + /* + * Isolate free pages if necessary, and if we are not aborting due to + * contention. + */ if (list_empty(&cc->freepages)) { - isolate_freepages(cc->zone, cc); + if (!cc->contended) + isolate_freepages(cc->zone, cc); if (list_empty(&cc->freepages)) return NULL; @@ -792,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage, } /* - * We cannot control nr_migratepages and nr_freepages fully when migration is - * running as migrate_pages() has no knowledge of compact_control. When - * migration is complete, we count the number of pages on the lists by hand. + * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. All pages on the freelist are from the same zone, so there is no + * special handling needed for NUMA. */ -static void update_nr_listpages(struct compact_control *cc) +static void compaction_free(struct page *page, unsigned long data) { - int nr_migratepages = 0; - int nr_freepages = 0; - struct page *page; - - list_for_each_entry(page, &cc->migratepages, lru) - nr_migratepages++; - list_for_each_entry(page, &cc->freepages, lru) - nr_freepages++; + struct compact_control *cc = (struct compact_control *)data; - cc->nr_migratepages = nr_migratepages; - cc->nr_freepages = nr_freepages; + list_add(&page->lru, &cc->freepages); + cc->nr_freepages++; } /* possible outcome of isolate_migratepages */ @@ -855,13 +892,14 @@ static int compact_finished(struct zone *zone, unsigned int order; unsigned long watermark; - if (fatal_signal_pending(current)) + if (cc->contended || fatal_signal_pending(current)) return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { /* Let the next compaction start anew. */ - zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = zone_end_pfn(zone); /* @@ -961,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -986,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -994,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); @@ -1002,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { - unsigned long nr_migrate, nr_remaining; int err; switch (isolate_migratepages(zone, cc)) { @@ -1017,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - nr_migrate = cc->nr_migratepages; + if (!cc->nr_migratepages) + continue; + err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); - update_nr_listpages(cc); - nr_remaining = cc->nr_migratepages; - trace_mm_compaction_migratepages(nr_migrate - nr_remaining, - nr_remaining); + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); - /* Release isolated pages not migrated */ + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; if (err) { putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it @@ -1053,9 +1091,8 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync, bool *contended) +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1064,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1086,7 +1123,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from - * @sync: Whether migration is synchronous or not + * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention * @page: Optionally capture a free page of the requested order during compaction * @@ -1094,7 +1131,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1119,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = max(status, rc); @@ -1158,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (zone_watermark_ok(zone, cc->order, low_wmark_pages(zone), 0, 0)) compaction_defer_reset(zone, cc->order, false); - /* Currently async compaction is never deferred. */ - else if (cc->sync) - defer_compaction(zone, cc->order); } VM_BUG_ON(!list_empty(&cc->freepages)); @@ -1172,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = MIGRATE_ASYNC, }; if (!order) @@ -1185,7 +1219,8 @@ static void compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, }; __compact_pgdat(NODE_DATA(nid), &cc); @@ -1225,7 +1260,7 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write, } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) -ssize_t sysfs_compact_node(struct device *dev, +static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf..ba8019b063e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->boundary = boundary; retval->allocation = allocation; - if (dev) { - int ret; + INIT_LIST_HEAD(&retval->pools); - mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - ret = device_create_file(dev, &dev_attr_pools); - else - ret = 0; - /* note: not currently insisting "name" be unique */ - if (!ret) - list_add(&retval->pools, &dev->dma_pools); - else { - kfree(retval); - retval = NULL; - } - mutex_unlock(&pools_lock); + mutex_lock(&pools_lock); + if (list_empty(&dev->dma_pools) && + device_create_file(dev, &dev_attr_pools)) { + kfree(retval); + retval = NULL; } else - INIT_LIST_HEAD(&retval->pools); + list_add(&retval->pools, &dev->dma_pools); + mutex_unlock(&pools_lock); return retval; } @@ -341,10 +333,10 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, continue; if (pool->dev) dev_err(pool->dev, - "dma_pool_alloc %s, %p (corruped)\n", + "dma_pool_alloc %s, %p (corrupted)\n", pool->name, retval); else - pr_err("dma_pool_alloc %s, %p (corruped)\n", + pr_err("dma_pool_alloc %s, %p (corrupted)\n", pool->name, retval); /* @@ -508,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); - dma_pool_destroy(pool); + WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool)); } EXPORT_SYMBOL(dmam_pool_destroy); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c new file mode 100644 index 00000000000..e10ccd299d6 --- /dev/null +++ b/mm/early_ioremap.c @@ -0,0 +1,245 @@ +/* + * Provide common bits of early_ioremap() support for architectures needing + * temporary mappings during boot before ioremap() is available. + * + * This is mostly a direct copy of the x86 early_ioremap implementation. + * + * (C) Copyright 1995 1996, 2014 Linus Torvalds + * + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <asm/fixmap.h> + +#ifdef CONFIG_MMU +static int early_ioremap_debug __initdata; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static int after_paging_init __initdata; + +void __init __weak early_ioremap_shutdown(void) +{ +} + +void __init early_ioremap_reset(void) +{ + early_ioremap_shutdown(); + after_paging_init = 1; +} + +/* + * Generally, ioremap() is available after paging_init() has been called. + * Architectures wanting to allow early_ioremap after paging_init() can + * define __late_set_fixmap and __late_clear_fixmap to do the right thing. + */ +#ifndef __late_set_fixmap +static inline void __init __late_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot) +{ + BUG(); +} +#endif + +#ifndef __late_clear_fixmap +static inline void __init __late_clear_fixmap(enum fixed_addresses idx) +{ + BUG(); +} +#endif + +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +void __init early_ioremap_setup(void) +{ + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (WARN_ON(prev_map[i])) + break; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); +} + +static int __init check_early_ioremap_leak(void) +{ + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (WARN(count, KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n" + "please boot with early_ioremap_debug and report the dmesg.\n", + count)) + return 1; + return 0; +} +late_initcall(check_early_ioremap_leak); + +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) +{ + unsigned long offset; + resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + WARN_ON(system_state != SYSTEM_BOOTING); + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n", + __func__, (u64)phys_addr, size)) + return NULL; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (WARN_ON(!size || last_addr < phys_addr)) + return NULL; + + prev_size[slot] = size; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (WARN_ON(nrpages > NR_FIX_BTMAPS)) + return NULL; + + /* + * Ok, go for it.. + */ + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_set_fixmap(idx, phys_addr, prot); + else + __early_set_fixmap(idx, phys_addr, prot); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n", + __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]); + + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", + addr, size)) + return; + + if (WARN(prev_size[slot] != size, + "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot])) + return; + + WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", + addr, size, slot); + + virt_addr = (unsigned long)addr; + if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) + return; + + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} + +/* Remap an IO device */ +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO); +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, + FIXMAP_PAGE_NORMAL); +} +#else /* CONFIG_MMU */ + +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void __iomem *)phys_addr; +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ +} + +#endif /* CONFIG_MMU */ + + +void __init early_memunmap(void *addr, unsigned long size) +{ + early_iounmap((__force void __iomem *)addr, size); +} diff --git a/mm/filemap.c b/mm/filemap.c index 7a13f6ac542..90effcdf948 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,8 +31,10 @@ #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ +#include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -76,7 +78,7 @@ * ->mmap_sem * ->lock_page (access_process_vm) * - * ->i_mutex (generic_file_buffered_write) + * ->i_mutex (generic_perform_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock @@ -107,12 +109,75 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static void page_cache_tree_delete(struct address_space *mapping, + struct page *page, void *shadow) +{ + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; + + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { + mapping->nrshadows++; + /* + * Make sure the nrshadows update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } + mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } +} + /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -127,10 +192,11 @@ void __delete_from_page_cache(struct page *page) else cleancache_invalidate_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); + page_cache_tree_delete(mapping, page, shadow); + page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); @@ -166,9 +232,8 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (freepage) freepage(page); @@ -176,25 +241,15 @@ void delete_from_page_cache(struct page *page) } EXPORT_SYMBOL(delete_from_page_cache); -static int sleep_on_page(void *word) -{ - io_schedule(); - return 0; -} - -static int sleep_on_page_killable(void *word) -{ - sleep_on_page(word); - return fatal_signal_pending(current) ? -EINTR : 0; -} - static int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + if (test_bit(AS_ENOSPC, &mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) + if (test_bit(AS_EIO, &mapping->flags) && + test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } @@ -426,7 +481,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->index = offset; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -434,8 +489,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); - /* mem_cgroup codes must not be called under tree_lock */ - mem_cgroup_replace_page_cache(old, new); + mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) freepage(old); @@ -446,32 +500,71 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -/** - * add_to_page_cache_locked - add a locked page to the pagecache - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * - * This function is used to add a page to the pagecache. It must be locked. - * This function does not add the page to the LRU. The caller must do that. - */ -int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) { + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + if (shadowp) + *shadowp = p; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; +} + +static int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) +{ + int huge = PageHuge(page); + struct mem_cgroup *memcg; int error; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_cache_charge(page, current->mm, - gfp_mask & GFP_RECLAIM_MASK); - if (error) - return error; + if (!huge) { + error = mem_cgroup_try_charge(page, current->mm, + gfp_mask, &memcg); + if (error) + return error; + } error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); return error; } @@ -480,33 +573,68 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->index = offset; spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); + error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + if (!huge) + mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); return error; } + +/** + * add_to_page_cache_locked - add a locked page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add a page to the pagecache. It must be locked. + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, NULL); +} EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { + void *shadow = NULL; int ret; - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add_file(page); + __set_page_locked(page); + ret = __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, &shadow); + if (unlikely(ret)) + __clear_page_locked(page); + else { + /* + * The page might have been evicted from cache only + * recently, in which case it should be activated like + * any other repeatedly accessed page. + */ + if (shadow && workingset_refault(shadow)) { + SetPageActive(page); + workingset_activation(page); + } else + ClearPageActive(page); + lru_cache_add(page); + } return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -520,10 +648,10 @@ struct page *__page_cache_alloc(gfp_t gfp) if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; } @@ -559,7 +687,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@ -572,7 +700,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) return 0; return __wait_on_bit(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } /** @@ -609,7 +737,7 @@ void unlock_page(struct page *page) { VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -620,17 +748,51 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (TestClearPageReclaim(page)) + /* + * TestClearPageReclaim could be used here but it is an atomic + * operation and overkill in this particular case. Failing to + * shuffle a page marked for immediate reclaim is too mild to + * justify taking an atomic operation penalty at the end of + * ever page writeback. + */ + if (PageReclaim(page)) { + ClearPageReclaim(page); rotate_reclaimable_page(page); + } if (!test_clear_page_writeback(page)) BUG(); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_writeback); } EXPORT_SYMBOL(end_page_writeback); +/* + * After completing I/O on a page, call this routine to update the page + * flags appropriately + */ +void page_endio(struct page *page, int rw, int err) +{ + if (rw == READ) { + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* rw == WRITE */ + if (err) { + SetPageError(page); + if (page->mapping) + mapping_set_error(page->mapping, err); + } + end_page_writeback(page); + } +} +EXPORT_SYMBOL_GPL(page_endio); + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock @@ -639,7 +801,7 @@ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -649,10 +811,21 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); +/* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. + */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -686,14 +859,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * find_get_page - find and get a page reference + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. + * + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. + * + * Otherwise, %NULL is returned. */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; @@ -710,9 +970,9 @@ repeat: if (radix_tree_deref_retry(page)) goto repeat; /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. */ goto out; } @@ -734,24 +994,30 @@ out: return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_lock_entry - locate, pin and lock a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - page = find_get_page(mapping, offset); + page = find_get_entry(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ @@ -764,44 +1030,90 @@ repeat: } return page; } -EXPORT_SYMBOL(find_lock_page); +EXPORT_SYMBOL(find_lock_entry); /** - * find_or_create_page - locate or add a pagecache page - * @mapping: the page's address_space - * @index: the page's index into the mapping - * @gfp_mask: page allocation mode + * pagecache_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * @fgp_flags: PCG flags + * @cache_gfp_mask: gfp mask to use for the page cache data page allocation + * @radix_gfp_mask: gfp mask to use for radix tree node allocation + * + * Looks up the page cache slot at @mapping & @offset. + * + * PCG flags modify how the page is returned. * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * FGP_ACCESSED: the page will be marked accessed + * FGP_LOCK: Page is return locked + * FGP_CREAT: If page is not present then a new page is allocated using + * @cache_gfp_mask and added to the page cache and the VM's LRU + * list. If radix tree nodes are allocated during page cache + * insertion then @radix_gfp_mask is used. The page is returned + * locked and with an increased refcount. Otherwise, %NULL is + * returned. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even + * if the GFP flags specified for FGP_CREAT are atomic. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * If there is a page cache page, it is returned with an increased refcount. */ -struct page *find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask) +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) { struct page *page; - int err; + repeat: - page = find_lock_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp_mask); + page = find_get_entry(mapping, offset); + if (radix_tree_exceptional_entry(page)) + page = NULL; + if (!page) + goto no_page; + + if (fgp_flags & FGP_LOCK) { + if (fgp_flags & FGP_NOWAIT) { + if (!trylock_page(page)) { + page_cache_release(page); + return NULL; + } + } else { + lock_page(page); + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON_PAGE(page->index != offset, page); + } + + if (page && (fgp_flags & FGP_ACCESSED)) + mark_page_accessed(page); + +no_page: + if (!page && (fgp_flags & FGP_CREAT)) { + int err; + if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + cache_gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) { + cache_gfp_mask &= ~__GFP_FS; + radix_gfp_mask &= ~__GFP_FS; + } + + page = __page_cache_alloc(cache_gfp_mask); if (!page) return NULL; - /* - * We want a regular kernel memory (not highmem or DMA etc) - * allocation for the radix tree nodes, but we need to honour - * the context-specific requirements the caller has asked for. - * GFP_RECLAIM_MASK collects those requirements. - */ - err = add_to_page_cache_lru(page, mapping, index, - (gfp_mask & GFP_RECLAIM_MASK)); + + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + fgp_flags |= FGP_LOCK; + + /* Init accessed so avoid atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + __SetPageReferenced(page); + + err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -809,9 +1121,80 @@ repeat: goto repeat; } } + return page; } -EXPORT_SYMBOL(find_or_create_page); +EXPORT_SYMBOL(pagecache_get_page); + +/** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages, or swap entries from + * shmem/tmpfs, are included in the returned array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} /** * find_get_pages - gang pagecache lookup @@ -859,9 +1242,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so skip over it - - * we only reach this from invalidate_mapping_pages(). + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Skip + * over it. */ continue; } @@ -926,9 +1309,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so stop looking for - * contiguous pages. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Stop + * looking for contiguous pages. */ break; } @@ -1002,10 +1385,17 @@ repeat: goto restart; } /* - * This function is never used on a shmem/tmpfs - * mapping, so a swap entry won't be found here. + * A shadow entry of a recently evicted page. + * + * Those entries should never be tagged, but + * this tree walk is lockless and the tags are + * looked up in bulk, one radix tree node at a + * time, so there is a sizable window for page + * reclaim to evict a page we saw tagged. + * + * Skip over it. */ - BUG(); + continue; } if (!page_cache_get_speculative(page)) @@ -1031,39 +1421,6 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); -/** - * grab_cache_page_nowait - returns locked page at given index in given cache - * @mapping: target address_space - * @index: the page index - * - * Same as grab_cache_page(), but do not wait if the page is unavailable. - * This is intended for speculative data generators, where the data can - * be regenerated if the page couldn't be grabbed. This routine should - * be safe to call while holding the lock for another page. - * - * Clear __GFP_FS when allocating the page to avoid recursion into the fs - * and deadlock against the caller's locked page. - */ -struct page * -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) -{ - struct page *page = find_get_page(mapping, index); - - if (page) { - if (trylock_page(page)) - return page; - page_cache_release(page); - return NULL; - } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { - page_cache_release(page); - page = NULL; - } - return page; -} -EXPORT_SYMBOL(grab_cache_page_nowait); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -1089,7 +1446,8 @@ static void shrink_readahead_size_eio(struct file *filp, * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position - * @desc: read_descriptor + * @iter: data destination + * @written: already copied * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1097,8 +1455,8 @@ static void shrink_readahead_size_eio(struct file *filp, * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. */ -static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc) +static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, + struct iov_iter *iter, ssize_t written) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1108,12 +1466,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos, pgoff_t prev_index; unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - int error; + int error = 0; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { @@ -1148,7 +1506,7 @@ find_page: if (!page->mapping) goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, - desc, offset)) + offset, iter->count)) goto page_not_up_to_date_locked; unlock_page(page); } @@ -1198,24 +1556,23 @@ page_ok: /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The file_read_actor routine returns how many bytes were - * actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = file_read_actor(desc, page, offset, nr); + + ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; page_cache_release(page); - if (ret == nr && desc->count) - continue; - goto out; + written += ret; + if (!iov_iter_count(iter)) + goto out; + if (ret < nr) { + error = -EFAULT; + goto out; + } + continue; page_not_up_to_date: /* Get exclusive access to the page ... */ @@ -1250,6 +1607,7 @@ readpage: if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); + error = 0; goto find_page; } goto readpage_error; @@ -1280,7 +1638,6 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ - desc->error = error; page_cache_release(page); goto out; @@ -1291,16 +1648,17 @@ no_cached_page: */ page = page_cache_alloc_cold(mapping); if (!page) { - desc->error = -ENOMEM; + error = -ENOMEM; goto out; } error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { page_cache_release(page); - if (error == -EEXIST) + if (error == -EEXIST) { + error = 0; goto find_page; - desc->error = error; + } goto out; } goto readpage; @@ -1313,130 +1671,45 @@ out: *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; file_accessed(filp); + return written ? written : error; } -int file_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - - if (size > count) - size = count; - - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. - */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; - } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; -} - -/* - * Performs necessary checks before doing a write - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @count: number of bytes to write - * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE - * - * Adjust number of segments and amount of bytes to write (nr_segs should be - * properly initialized first). Returns appropriate error code that caller - * should return or zero in case that write should be allowed. - */ -int generic_segment_checks(const struct iovec *iov, - unsigned long *nr_segs, size_t *count, int access_flags) -{ - unsigned long seg; - size_t cnt = 0; - for (seg = 0; seg < *nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - cnt += iv->iov_len; - if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(access_flags, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - *nr_segs = seg; - cnt -= iv->iov_len; /* This segment is no good */ - break; - } - *count = cnt; - return 0; -} -EXPORT_SYMBOL(generic_segment_checks); - /** - * generic_file_aio_read - generic filesystem read routine + * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @pos: current file position + * @iter: destination for the data read * - * This is the "read()" routine for all filesystems + * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. */ ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg = 0; - size_t count; + struct file *file = iocb->ki_filp; + ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; - - count = 0; - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; + loff_t pos = *ppos; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (filp->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT) { + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); loff_t size; - struct address_space *mapping; - struct inode *inode; - mapping = filp->f_mapping; - inode = mapping->host; if (!count) goto out; /* skip atime */ size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, pos, - pos + iov_length(iov, nr_segs) - 1); + pos + count - 1); if (!retval) { - retval = mapping->a_ops->direct_IO(READ, iocb, - iov, pos, nr_segs); + struct iov_iter data = *iter; + retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); } + if (retval > 0) { *ppos = pos + retval; - count -= retval; + iov_iter_advance(iter, retval); } /* @@ -1447,49 +1720,17 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, * and return. Otherwise fallthrough to buffered io for * the rest of the read. */ - if (retval < 0 || !count || *ppos >= size) { - file_accessed(filp); + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + file_accessed(file); goto out; } } - count = retval; - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - loff_t offset = 0; - - /* - * If we did a short DIO read we need to skip the section of the - * iov that we've already read data into. - */ - if (count) { - if (count > iov[seg].iov_len) { - count -= iov[seg].iov_len; - continue; - } - offset = count; - count = 0; - } - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base + offset; - desc.count = iov[seg].iov_len - offset; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp, ppos, &desc); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } + retval = do_generic_file_read(file, ppos, iter, retval); out: return retval; } -EXPORT_SYMBOL(generic_file_aio_read); +EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU /** @@ -1604,6 +1845,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -1614,11 +1867,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - pgoff_t size; + loff_t size; int ret = 0; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (offset >= size) + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (offset >= size >> PAGE_CACHE_SHIFT) return VM_FAULT_SIGBUS; /* @@ -1667,8 +1920,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(offset >= size)) { + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; @@ -1726,6 +1979,78 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct radix_tree_iter iter; + void **slot; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + loff_t size; + struct page *page; + unsigned long address = (unsigned long) vmf->virtual_address; + unsigned long addr; + pte_t *pte; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { + if (iter.index > vmf->max_pgoff) + break; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + goto next; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + break; + else + goto next; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + if (!PageUptodate(page) || + PageReadahead(page) || + PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); + if (page->index >= size >> PAGE_CACHE_SHIFT) + goto unlock; + + pte = vmf->pte + page->index - vmf->pgoff; + if (!pte_none(*pte)) + goto unlock; + + if (file->f_ra.mmap_miss > 0) + file->f_ra.mmap_miss--; + addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; + do_set_pte(vma, addr, page, pte, false, false); + unlock_page(page); + goto next; +unlock: + unlock_page(page); +skip: + page_cache_release(page); +next: + if (iter.index == vmf->max_pgoff) + break; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(filemap_map_pages); + int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; @@ -1755,6 +2080,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -1795,6 +2121,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), @@ -1821,6 +2159,8 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; @@ -1857,6 +2197,10 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); @@ -1864,40 +2208,25 @@ out: } /** - * read_cache_page_async - read into page cache, fill it if needed + * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read * @data: first arg to filler(data, page) function, often left as NULL * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. */ -struct page *read_cache_page_async(struct address_space *mapping, +struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } -EXPORT_SYMBOL(read_cache_page_async); - -static struct page *wait_on_page_read(struct page *page) -{ - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - } - return page; -} +EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. @@ -1916,175 +2245,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, { filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) -{ - return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -} -EXPORT_SYMBOL(read_cache_page); - -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. - */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - -void iov_iter_advance(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - /* * Performs necessary checks before doing a write * @@ -2184,15 +2348,12 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - mark_page_accessed(page); return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, - size_t count, size_t ocount) +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -2200,11 +2361,9 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, ssize_t written; size_t write_len; pgoff_t end; + struct iov_iter data; - if (count != ocount) - *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - - write_len = iov_length(iov, *nr_segs); + write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); @@ -2231,7 +2390,8 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } } - written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + data = *from; + written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); /* * Finally, try again to invalidate clean pages which might have been @@ -2248,11 +2408,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (written > 0) { pos += written; + iov_iter_advance(from, written); if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = pos; + iocb->ki_pos = pos; } out: return written; @@ -2266,39 +2427,23 @@ EXPORT_SYMBOL(generic_file_direct_write); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { - int status; - gfp_t gfp_mask; struct page *page; - gfp_t gfp_notmask = 0; + int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; - gfp_mask = mapping_gfp_mask(mapping); - if (mapping_cap_account_dirty(mapping)) - gfp_mask |= __GFP_WRITE; if (flags & AOP_FLAG_NOFS) - gfp_notmask = __GFP_FS; -repeat: - page = find_lock_page(mapping, index); + fgp_flags |= FGP_NOFS; + + page = pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping), + GFP_KERNEL); if (page) - goto found; + wait_for_stable_page(page); - page = __page_cache_alloc(gfp_mask & ~gfp_notmask); - if (!page) - return NULL; - status = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & ~gfp_notmask); - if (unlikely(status)) { - page_cache_release(page); - if (status == -EEXIST) - goto repeat; - return NULL; - } -found: - wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); -static ssize_t generic_perform_write(struct file *file, +ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; @@ -2342,18 +2487,15 @@ again: status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); - if (unlikely(status)) + if (unlikely(status < 0)) break; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); - mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) @@ -2388,34 +2530,12 @@ again: return written ? written : status; } - -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) -{ - struct file *file = iocb->ki_filp; - ssize_t status; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); - status = generic_perform_write(file, &i, pos); - - if (likely(status >= 0)) { - written += status; - *ppos = pos + status; - } - - return written ? written : status; -} -EXPORT_SYMBOL(generic_file_buffered_write); +EXPORT_SYMBOL(generic_perform_write); /** - * __generic_file_aio_write - write data to a file + * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) - * @iov: vector with data to write - * @nr_segs: number of segments in the vector - * @ppos: position where to write + * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates @@ -2429,30 +2549,19 @@ EXPORT_SYMBOL(generic_file_buffered_write); * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_mutex. */ -ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - size_t ocount; /* original count */ - size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - loff_t pos; - ssize_t written; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; ssize_t err; - - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; - - count = ocount; - pos = *ppos; + ssize_t status; + size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - written = 0; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; @@ -2460,6 +2569,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; + iov_iter_truncate(from, count); + err = file_remove_suid(file); if (err) goto out; @@ -2471,42 +2582,40 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - ssize_t written_buffered; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - ppos, count, ocount); + written = generic_file_direct_write(iocb, from, pos); if (written < 0 || written == count) goto out; + /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ pos += written; count -= written; - written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, ppos, count, - written); + + status = generic_perform_write(file, from, pos); /* - * If generic_file_buffered_write() retuned a synchronous error + * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were * direct-written, or the error code if that was zero. Note * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (written_buffered < 0) { - err = written_buffered; + if (unlikely(status < 0)) { + err = status; goto out; } - + iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. */ - endbyte = pos + written_buffered - written - 1; + endbyte = pos + status - 1; err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { - written = written_buffered; + written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); @@ -2517,37 +2626,33 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; } out: current->backing_dev_info = NULL; return written ? written : err; } -EXPORT_SYMBOL(__generic_file_aio_write); +EXPORT_SYMBOL(__generic_file_write_iter); /** - * generic_file_aio_write - write data to a file + * generic_file_write_iter - write data to a file * @iocb: IO state structure - * @iov: vector with data to write - * @nr_segs: number of segments in the vector - * @pos: position in file where to write + * @from: iov_iter with data to write * - * This is a wrapper around __generic_file_aio_write() to be used by most + * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_mutex as needed. */ -ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (ret > 0) { @@ -2559,7 +2664,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } return ret; } -EXPORT_SYMBOL(generic_file_aio_write); +EXPORT_SYMBOL(generic_file_write_iter); /** * try_to_release_page() - release old fs-specific metadata on a page diff --git a/mm/fremap.c b/mm/fremap.c index bbc4d660221..72b8fa36143 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -23,28 +23,44 @@ #include "internal.h" +static int mm_counter(struct page *page) +{ + return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; +} + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; + struct page *page; + swp_entry_t entry; if (pte_present(pte)) { - struct page *page; - flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); if (page) { if (pte_dirty(pte)) set_page_dirty(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, mm_counter(page)); page_remove_rmap(page); page_cache_release(page); + } + } else { /* zap_pte() is not called when pte_none() */ + if (!pte_file(pte)) { update_hiwater_rss(mm); - dec_mm_counter(mm, MM_FILEPAGES); + entry = pte_to_swp_entry(pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + dec_mm_counter(mm, mm_counter(page)); + } + } else { + free_swap_and_cache(entry); + dec_mm_counter(mm, MM_SWAPENTS); + } } - } else { - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } } @@ -66,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, ptfile = pgoff_to_pte(pgoff); - if (!pte_none(*pte)) { - if (pte_present(*pte) && pte_soft_dirty(*pte)) - pte_file_mksoft_dirty(ptfile); + if (!pte_none(*pte)) zap_pte(mm, vma, addr, pte); - } - set_pte_at(mm, addr, pte, ptfile); + set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a @@ -136,6 +149,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, int has_write_lock = 0; vm_flags_t vm_flags = 0; + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + if (prot) return err; /* diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdcb319..c30eec536f0 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); static unsigned long __frontswap_curr_pages(void) { - int type; unsigned long totalpages = 0; struct swap_info_struct *si = NULL; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + plist_for_each_entry(si, &swap_active_head, list) totalpages += atomic_read(&si->frontswap_pages); - } return totalpages; } @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, int si_frontswap_pages; unsigned long total_pages_to_unuse = total; unsigned long pages = 0, pages_to_unuse = 0; - int type; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + plist_for_each_entry(si, &swap_active_head, list) { si_frontswap_pages = atomic_read(&si->frontswap_pages); if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } vm_unacct_memory(pages); *unused = pages_to_unuse; - *swapid = type; + *swapid = si->type; ret = 0; break; } @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) /* * we don't want to hold swap_lock while doing a very * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list.head each time + * so restart scan from swap_active_head each time */ spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 00000000000..91d044b1600 --- /dev/null +++ b/mm/gup.c @@ -0,0 +1,674 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/spinlock.h> + +#include <linux/hugetlb.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> + +#include "internal.h" + +static struct page *no_page_table(struct vm_area_struct *vma, + unsigned int flags) +{ + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. + */ + if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return NULL; +} + +static struct page *follow_page_pte(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + spinlock_t *ptl; + pte_t *ptep, pte; + +retry: + if (unlikely(pmd_bad(*pmd))) + return no_page_table(vma, flags); + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto retry; + } + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; + if ((flags & FOLL_WRITE) && !pte_write(pte)) { + pte_unmap_unlock(ptep, ptl); + return NULL; + } + + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } + + if (flags & FOLL_GET) + get_page_foll(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). + */ + mark_page_accessed(page); + } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. + */ + mlock_vma_page(page); + unlock_page(page); + } + } + pte_unmap_unlock(ptep, ptl); + return page; +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return NULL; + return no_page_table(vma, flags); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + return page; + } + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (flags & FOLL_GET) + return NULL; + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return no_page_table(vma, flags); + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else + page = NULL; + } + return page; + } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + return no_page_table(vma, flags); + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(vma, address, pmd); + return follow_page_pte(vma, address, pmd, flags); + } + ptl = pmd_lock(mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(vma, address, + pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; + } + } else + spin_unlock(ptl); + } + return follow_page_pte(vma, address, pmd, flags); +} + +static int get_gate_page(struct mm_struct *mm, unsigned long address, + unsigned int gup_flags, struct vm_area_struct **vma, + struct page **page) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret = -EFAULT; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + return -EFAULT; + if (address > TASK_SIZE) + pgd = pgd_offset_k(address); + else + pgd = pgd_offset_gate(mm, address); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, address); + if (pte_none(*pte)) + goto unmap; + *vma = get_gate_vma(mm); + if (!page) + goto out; + *page = vm_normal_page(*vma, address, *pte); + if (!*page) { + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + goto unmap; + *page = pte_page(*pte); + } + get_page(*page); +out: + ret = 0; +unmap: + pte_unmap(pte); + return ret; +} + +/* + * mmap_sem must be held on entry. If @nonblocking != NULL and + * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. + * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + */ +static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned int fault_flags = 0; + int ret; + + /* For mlock, just skip the stack guard page. */ + if ((*flags & FOLL_MLOCK) && + (stack_guard_page_start(vma, address) || + stack_guard_page_end(vma, address + PAGE_SIZE))) + return -ENOENT; + if (*flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (*flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return -EBUSY; + } + + /* + * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when + * necessary, even if maybe_mkwrite decided not to set pte_write. We + * can thus safely do subsequent page lookups as if they were reads. + * But only do so when looping for pte_write is futile: in some cases + * userspace may also be wanting to write to the gotten user page, + * which a read fault here might prevent (a readonly page might get + * reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) + *flags &= ~FOLL_WRITE; + return 0; +} + +static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) +{ + vm_flags_t vm_flags = vma->vm_flags; + + if (vm_flags & (VM_IO | VM_PFNMAP)) + return -EFAULT; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * We used to let the write,force case do COW in a + * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could + * set a breakpoint in a read-only mapping of an + * executable, without corrupting the file (yet only + * when that file had been opened for writing!). + * Anon pages in shared mappings are surprising: now + * just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + return -EFAULT; + } + } + } else if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * Is there actually any vma we can reach here which does not + * have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + return -EFAULT; + } + return 0; +} + +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held. It may be released. See below. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. Further, if @gup_flags does not + * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in + * this case. + * + * A caller using such a combination of @nonblocking and @gup_flags + * must therefore hold the mmap_sem for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) +{ + long i = 0; + unsigned int page_mask; + struct vm_area_struct *vma = NULL; + + if (!nr_pages) + return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + + /* + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + + do { + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(mm, start)) { + int ret; + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, + pages ? &pages[i] : NULL); + if (ret) + return i ? : ret; + page_mask = 0; + goto next_page; + } + + if (!vma || check_vma_flags(vma, gup_flags)) + return i ? : -EFAULT; + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, + gup_flags); + continue; + } + } +retry: + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; + cond_resched(); + page = follow_page_mask(vma, start, foll_flags, &page_mask); + if (!page) { + int ret; + ret = faultin_page(tsk, vma, start, &foll_flags, + nonblocking); + switch (ret) { + case 0: + goto retry; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + return i ? i : ret; + case -EBUSY: + return i; + case -ENOENT: + goto next_page; + } + BUG(); + } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); + if (pages) { + pages[i] = page; + flush_anon_page(vma, page, start); + flush_dcache_page(page); + page_mask = 0; + } +next_page: + if (vmas) { + vmas[i] = vma; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; + } while (nr_pages); + return i; +} +EXPORT_SYMBOL(__get_user_pages); + +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + vm_flags_t vm_flags; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* + * get_user_pages() - pin user pages in memory + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) +{ + int flags = FOLL_TOUCH; + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); +} +EXPORT_SYMBOL(get_user_pages); + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ diff --git a/mm/highmem.c b/mm/highmem.c index b32b70cdaed..123bcd3ed4f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); */ #ifdef CONFIG_HIGHMEM +/* + * Architecture with aliasing data cache may define the following family of + * helper functions in its asm/highmem.h to control cache color of virtual + * addresses where physical memory pages are mapped by kmap. + */ +#ifndef get_pkmap_color + +/* + * Determine color of virtual address where the page should be mapped. + */ +static inline unsigned int get_pkmap_color(struct page *page) +{ + return 0; +} +#define get_pkmap_color get_pkmap_color + +/* + * Get next index for mapping inside PKMAP region for page with given color. + */ +static inline unsigned int get_next_pkmap_nr(unsigned int color) +{ + static unsigned int last_pkmap_nr; + + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + return last_pkmap_nr; +} + +/* + * Determine if page index inside PKMAP region (pkmap_nr) of given color + * has wrapped around PKMAP region end. When this happens an attempt to + * flush all unused PKMAP slots is made. + */ +static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) +{ + return pkmap_nr == 0; +} + +/* + * Get the number of PKMAP entries of the given color. If no free slot is + * found after checking that many entries, kmap will sleep waiting for + * someone to call kunmap and free PKMAP slot. + */ +static inline int get_pkmap_entries_count(unsigned int color) +{ + return LAST_PKMAP; +} + +/* + * Get head of a wait queue for PKMAP entries of the given color. + * Wait queues for different mapping colors should be independent to avoid + * unnecessary wakeups caused by freeing of slots of other colors. + */ +static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) +{ + static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + + return &pkmap_map_wait; +} +#endif + unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); @@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void) } static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); pte_t * pkmap_page_table; -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); - /* * Most architectures have no use for kmap_high_get(), so let's abstract * the disabling of IRQ out of the locking in that case to save on a @@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; int count; + unsigned int last_pkmap_nr; + unsigned int color = get_pkmap_color(page); start: - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); /* Find an empty entry */ for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { + last_pkmap_nr = get_next_pkmap_nr(color); + if (no_more_pkmaps(last_pkmap_nr, color)) { flush_all_zero_pkmaps(); - count = LAST_PKMAP; + count = get_pkmap_entries_count(color); } if (!pkmap_count[last_pkmap_nr]) break; /* Found a usable entry */ @@ -181,12 +240,14 @@ start: */ { DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *pkmap_map_wait = + get_pkmap_wait_queue_head(color); __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); + add_wait_queue(pkmap_map_wait, &wait); unlock_kmap(); schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); + remove_wait_queue(pkmap_map_wait, &wait); lock_kmap(); /* Somebody else might have mapped it while we slept */ @@ -274,6 +335,8 @@ void kunmap_high(struct page *page) unsigned long nr; unsigned long flags; int need_wakeup; + unsigned int color = get_pkmap_color(page); + wait_queue_head_t *pkmap_map_wait; lock_kmap_any(flags); vaddr = (unsigned long)page_address(page); @@ -299,13 +362,14 @@ void kunmap_high(struct page *page) * no need for the wait-queue-head's lock. Simply * test if the queue is empty. */ - need_wakeup = waitqueue_active(&pkmap_map_wait); + pkmap_map_wait = get_pkmap_wait_queue_head(color); + need_wakeup = waitqueue_active(pkmap_map_wait); } unlock_kmap_any(flags); /* do wake-up, if needed, race-free outside of the spin lock */ if (need_wakeup) - wake_up(&pkmap_map_wait); + wake_up(pkmap_map_wait); } EXPORT_SYMBOL(kunmap_high); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655a2d7..f8ffd9412ec 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -5,6 +5,8 @@ * the COPYING file in the top-level directory. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/sched.h> #include <linux/highmem.h> @@ -151,8 +153,7 @@ static int start_khugepaged(void) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (unlikely(IS_ERR(khugepaged_thread))) { - printk(KERN_ERR - "khugepaged: kthread_run(khugepaged) failed\n"); + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } @@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); + pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group; } @@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str) } out: if (!ret) - printk(KERN_WARNING - "transparent_hugepage= cannot parse, ignored\n"); + pr_warn("transparent_hugepage= cannot parse, ignored\n"); return ret; } __setup("transparent_hugepage=", setup_transparent_hugepage); @@ -715,13 +715,20 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, unsigned long haddr, pmd_t *pmd, struct page *page) { + struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + return VM_FAULT_OOM; + pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) + if (unlikely(!pgtable)) { + mem_cgroup_cancel_charge(page, memcg); return VM_FAULT_OOM; + } clear_huge_page(page, haddr, HPAGE_PMD_NR); /* @@ -734,7 +741,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(mm, pgtable); } else { @@ -742,6 +749,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -827,13 +836,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { - mem_cgroup_uncharge_page(page); put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -941,79 +944,35 @@ unlock: spin_unlock(ptl); } -static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) +/* + * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages + * during copy_user_huge_page()'s copy_page_rep(): in the case when + * the source page gets split and a tail freed before copy completes. + * Called under pmd_lock of checked pmd, so safe from splitting itself. + */ +static void get_user_huge_page(struct page *page) { - spinlock_t *ptl; - pgtable_t pgtable; - pmd_t _pmd; - struct page *page; - int i, ret = 0; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!page) { - ret |= VM_FAULT_OOM; - goto out; - } + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - put_page(page); - ret |= VM_FAULT_OOM; - goto out; + atomic_add(HPAGE_PMD_NR, &page->_count); + while (++page < endpage) + get_huge_page_tail(page); + } else { + get_page(page); } +} - clear_user_highpage(page, address); - __SetPageUptodate(page); - - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto out_free_page; - - pmdp_clear_flush(vma, haddr, pmd); - /* leave pmd empty until pte is filled */ - - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); +static void put_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - if (haddr == (address & PAGE_MASK)) { - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - } else { - entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); - entry = pte_mkspecial(entry); - } - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); + while (page < endpage) + put_page(page++); + } else { + put_page(page); } - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); - spin_unlock(ptl); - put_huge_zero_page(); - inc_mm_counter(mm, MM_ANONPAGES); - - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - ret |= VM_FAULT_WRITE; -out: - return ret; -out_free_page: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_page(page); - put_page(page); - goto out; } static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, @@ -1023,6 +982,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + struct mem_cgroup *memcg; spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; @@ -1043,20 +1003,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_newpage_charge(pages[i], mm, - GFP_KERNEL))) { + mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, + &memcg))) { if (pages[i]) put_page(pages[i]); - mem_cgroup_uncharge_start(); while (--i >= 0) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); ret |= VM_FAULT_OOM; goto out; } + set_page_private(pages[i], (unsigned long)memcg); } for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1085,7 +1046,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pte_t *pte, entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); page_add_new_anon_rmap(pages[i], vma, haddr); + mem_cgroup_commit_charge(pages[i], memcg, false); + lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1109,12 +1074,12 @@ out: out_free_pages: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); goto out; } @@ -1125,6 +1090,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; + struct mem_cgroup *memcg; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -1149,7 +1115,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_page(page); + get_user_huge_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1161,8 +1127,8 @@ alloc: if (unlikely(!new_page)) { if (!page) { - ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, - address, pmd, orig_pmd, haddr); + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); @@ -1170,17 +1136,18 @@ alloc: split_huge_page(page); ret |= VM_FAULT_FALLBACK; } - put_page(page); + put_user_huge_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; } - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); - put_page(page); + put_user_huge_page(page); } else split_huge_page_pmd(vma, address, pmd); ret |= VM_FAULT_FALLBACK; @@ -1202,10 +1169,10 @@ alloc: spin_lock(ptl); if (page) - put_page(page); + put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); put_page(new_page); goto out_mn; } else { @@ -1214,6 +1181,8 @@ alloc: entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); if (!page) { @@ -1611,16 +1580,23 @@ pmd_t *page_check_address_pmd(struct page *page, enum page_check_address_pmd_flag flag, spinlock_t **ptl) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) return NULL; - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) return NULL; + pmd = pmd_offset(pud, address); + *ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) goto unlock; if (pmd_page(*pmd) != page) goto unlock; @@ -1718,7 +1694,7 @@ static void __split_huge_page_refcount(struct page *page, &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ - smp_mb(); + smp_mb__after_atomic(); /* * retain hwpoison flag of the poisoned tail page: @@ -1812,21 +1788,24 @@ static int __split_huge_page_map(struct page *page, if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); haddr = address; for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); + /* + * Note that pmd_numa is not transferred deliberately + * to avoid any possibility that pte_numa leaks to + * a PROT_NONE VMA by accident. + */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); - if (pmd_numa(*pmd)) - entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1898,10 +1877,11 @@ static void __split_huge_page(struct page *page, * the newly established pmd of the child later during the * walk, to be able to set it as pmd_trans_splitting too. */ - if (mapcount != page_mapcount(page)) - printk(KERN_ERR "mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); - BUG_ON(mapcount != page_mapcount(page)); + if (mapcount != page_mapcount(page)) { + pr_err("mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG(); + } __split_huge_page_refcount(page, list); @@ -1912,10 +1892,11 @@ static void __split_huge_page(struct page *page, BUG_ON(is_vma_temporary_stack(vma)); mapcount2 += __split_huge_page_map(page, vma, addr); } - if (mapcount != mapcount2) - printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); - BUG_ON(mapcount != mapcount2); + if (mapcount != mapcount2) { + pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG(); + } } /* @@ -1966,17 +1947,22 @@ out: int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - struct mm_struct *mm = vma->vm_mm; - switch (advice) { case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif /* * Be somewhat over-protective like KSM for now! */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; - if (mm->def_flags & VM_NOHUGEPAGE) - return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* @@ -2263,6 +2249,30 @@ static void khugepaged_alloc_sleep(void) static int khugepaged_node_load[MAX_NUMNODES]; +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. + */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + #ifdef CONFIG_NUMA static int khugepaged_find_target_node(void) { @@ -2419,6 +2429,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; + struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2429,7 +2440,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) return; /* @@ -2453,8 +2465,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; anon_vma_lock_write(vma->anon_vma); @@ -2518,6 +2528,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); @@ -2531,7 +2543,7 @@ out_up_write: return; out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; } @@ -2553,8 +2565,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2579,6 +2589,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); + if (khugepaged_scan_abort(node)) + goto out_unmap; khugepaged_node_load[node]++; VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) @@ -2803,7 +2815,7 @@ static int khugepaged(void *none) struct mm_slot *mm_slot; set_freezable(); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(); @@ -2907,12 +2919,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c01cb9fedb1..eeceeeb0901 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,6 +13,7 @@ #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> +#include <linux/compiler.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/bootmem.h> @@ -22,6 +23,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/page-isolation.h> +#include <linux/jhash.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -33,7 +35,6 @@ #include <linux/node.h> #include "internal.h" -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -53,6 +54,13 @@ static unsigned long __initdata default_hstate_size; */ DEFINE_SPINLOCK(hugetlb_lock); +/* + * Serializes faults on the same logical page. This is used to + * prevent spurious OOMs when the hugepage pool is fully utilized. + */ +static int num_fault_mutexes; +static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -135,15 +143,8 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantiation_mutex. To access or modify a region the caller - * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation_mutex: - * - * down_write(&mm->mmap_sem); - * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * The region data structures are embedded into a resv_map and + * protected by a resv_map's lock */ struct file_region { struct list_head link; @@ -151,10 +152,12 @@ struct file_region { long to; }; -static long region_add(struct list_head *head, long f, long t) +static long region_add(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -184,14 +187,18 @@ static long region_add(struct list_head *head, long f, long t) } nrg->from = f; nrg->to = t; + spin_unlock(&resv->lock); return 0; } -static long region_chg(struct list_head *head, long f, long t) +static long region_chg(struct resv_map *resv, long f, long t) { - struct file_region *rg, *nrg; + struct list_head *head = &resv->regions; + struct file_region *rg, *nrg = NULL; long chg = 0; +retry: + spin_lock(&resv->lock); /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -201,15 +208,21 @@ static long region_chg(struct list_head *head, long f, long t) * Subtle, allocate a new region at the position but make it zero * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + goto retry; + } - return t - f; + list_add(&nrg->link, rg->link.prev); + chg = t - f; + goto out_nrg; } /* Round our left edge to the current segment if it encloses us. */ @@ -222,7 +235,7 @@ static long region_chg(struct list_head *head, long f, long t) if (&rg->link == head) break; if (rg->from > t) - return chg; + goto out; /* We overlap with this area, if it extends further than * us then we must extend ourselves. Account for its @@ -233,20 +246,30 @@ static long region_chg(struct list_head *head, long f, long t) } chg -= rg->to - rg->from; } + +out: + spin_unlock(&resv->lock); + /* We already know we raced and no longer need the new region */ + kfree(nrg); + return chg; +out_nrg: + spin_unlock(&resv->lock); return chg; } -static long region_truncate(struct list_head *head, long end) +static long region_truncate(struct resv_map *resv, long end) { + struct list_head *head = &resv->regions; struct file_region *rg, *trg; long chg = 0; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (end <= rg->to) break; if (&rg->link == head) - return 0; + goto out; /* If we are in the middle of a region then adjust it. */ if (end > rg->from) { @@ -263,14 +286,19 @@ static long region_truncate(struct list_head *head, long end) list_del(&rg->link); kfree(rg); } + +out: + spin_unlock(&resv->lock); return chg; } -static long region_count(struct list_head *head, long f, long t) +static long region_count(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; + spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; @@ -286,6 +314,7 @@ static long region_count(struct list_head *head, long f, long t) chg += seg_to - seg_from; } + spin_unlock(&resv->lock); return chg; } @@ -376,39 +405,46 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -struct resv_map { - struct kref refs; - struct list_head regions; -}; - -static struct resv_map *resv_map_alloc(void) +struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) return NULL; kref_init(&resv_map->refs); + spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); return resv_map; } -static void resv_map_release(struct kref *ref) +void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); /* Clear out any active regions before we release the map. */ - region_truncate(&resv_map->regions, 0); + region_truncate(resv_map, 0); kfree(resv_map); } +static inline struct resv_map *inode_resv_map(struct inode *inode) +{ + return inode->i_mapping->private_data; +} + static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (vma->vm_flags & VM_MAYSHARE) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + return inode_resv_map(inode); + + } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return NULL; + } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -507,7 +543,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_support(h)) + if (hugepages_treat_as_movable || hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -540,7 +576,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, goto err; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask(h), &mpol, &nodemask); @@ -562,7 +598,7 @@ retry_cpuset: } mpol_cond_put(mpol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -570,25 +606,242 @@ err: return NULL; } +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. + */ +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. + */ +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +static void destroy_compound_gigantic_page(struct page *page, + unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __ClearPageTail(p); + set_page_refcounted(p); + p->first_page = NULL; + } + + set_compound_order(page, 0); + __ClearPageHead(page); +} + +static void free_gigantic_page(struct page *page, unsigned order) +{ + free_contig_range(page_to_pfn(page), 1 << order); +} + +static int __alloc_gigantic_page(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long end_pfn = start_pfn + nr_pages; + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); +} + +static bool pfn_range_valid_gigantic(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long i, end_pfn = start_pfn + nr_pages; + struct page *page; + + for (i = start_pfn; i < end_pfn; i++) { + if (!pfn_valid(i)) + return false; + + page = pfn_to_page(i); + + if (PageReserved(page)) + return false; + + if (page_count(page) > 0) + return false; + + if (PageHuge(page)) + return false; + } + + return true; +} + +static bool zone_spans_last_pfn(const struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + unsigned long last_pfn = start_pfn + nr_pages - 1; + return zone_spans_pfn(zone, last_pfn); +} + +static struct page *alloc_gigantic_page(int nid, unsigned order) +{ + unsigned long nr_pages = 1 << order; + unsigned long ret, pfn, flags; + struct zone *z; + + z = NODE_DATA(nid)->node_zones; + for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { + spin_lock_irqsave(&z->lock, flags); + + pfn = ALIGN(z->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(z, pfn, nr_pages)) { + if (pfn_range_valid_gigantic(pfn, nr_pages)) { + /* + * We release the zone lock here because + * alloc_contig_range() will also lock the zone + * at some point. If there's an allocation + * spinning on this lock, it may win the race + * and cause alloc_contig_range() to fail... + */ + spin_unlock_irqrestore(&z->lock, flags); + ret = __alloc_gigantic_page(pfn, nr_pages); + if (!ret) + return pfn_to_page(pfn); + spin_lock_irqsave(&z->lock, flags); + } + pfn += nr_pages; + } + + spin_unlock_irqrestore(&z->lock, flags); + } + + return NULL; +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); +static void prep_compound_gigantic_page(struct page *page, unsigned long order); + +static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) +{ + struct page *page; + + page = alloc_gigantic_page(nid, huge_page_order(h)); + if (page) { + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, nid); + } + + return page; +} + +static int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) +{ + struct page *page = NULL; + int nr_nodes, node; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_gigantic_page_node(h, node); + if (page) + return 1; + } + + return 0; +} + +static inline bool gigantic_page_supported(void) { return true; } +#else +static inline bool gigantic_page_supported(void) { return false; } +static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void destroy_compound_gigantic_page(struct page *page, + unsigned long order) { } +static inline int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) { return 0; } +#endif + static void update_and_free_page(struct hstate *h, struct page *page) { int i; - VM_BUG_ON(h->order >= MAX_ORDER); + if (hstate_is_gigantic(h) && !gigantic_page_supported()) + return; h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | - 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1 << PG_writeback); + 1 << PG_active | 1 << PG_private | + 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); set_page_refcounted(page); - arch_release_hugepage(page); - __free_pages(page, huge_page_order(h)); + if (hstate_is_gigantic(h)) { + destroy_compound_gigantic_page(page, huge_page_order(h)); + free_gigantic_page(page, huge_page_order(h)); + } else { + arch_release_hugepage(page); + __free_pages(page, huge_page_order(h)); + } } struct hstate *size_to_hstate(unsigned long size) @@ -602,7 +855,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -627,7 +880,7 @@ static void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -731,9 +984,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (h->order >= MAX_ORDER) - return NULL; - page = alloc_pages_exact_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, @@ -749,79 +999,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return page; } -/* - * common helper functions for hstate_next_node_to_{alloc|free}. - * We may have allocated or freed a huge page based on a different - * nodes_allowed previously, so h->next_node_to_{alloc|free} might - * be outside of *nodes_allowed. Ensure that we use an allowed - * node for alloc or free. - */ -static int next_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); - VM_BUG_ON(nid >= MAX_NUMNODES); - - return nid; -} - -static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - if (!node_isset(nid, *nodes_allowed)) - nid = next_node_allowed(nid, nodes_allowed); - return nid; -} - -/* - * returns the previously saved node ["this node"] from which to - * allocate a persistent huge page for the pool and advance the - * next node from which to allocate, handling wrap at end of node - * mask. - */ -static int hstate_next_node_to_alloc(struct hstate *h, - nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -/* - * helper for free_pool_huge_page() - return the previously saved - * node ["this node"] from which to free a huge page. Advance the - * next node id whether or not we find a free huge page to free so - * that the next attempt to free addresses the next node. - */ -static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); - h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ - nr_nodes--) - -#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_free(hs, mask)) || 1); \ - nr_nodes--) - static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; @@ -911,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; struct hstate *h; + if (!hugepages_supported()) + return; + /* Set scan step to minimum hugepage size */ for_each_hstate(h) if (order > huge_page_order(h)) @@ -925,7 +1105,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) struct page *page; unsigned int r_nid; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return NULL; /* @@ -1118,7 +1298,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; nr_pages = min(unused_resv_pages, h->surplus_huge_pages); @@ -1134,6 +1314,7 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } @@ -1150,45 +1331,34 @@ static void return_unused_surplus_pages(struct hstate *h, static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - return region_chg(&inode->i_mapping->private_list, - idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; + long chg; - } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv = vma_resv_map(vma); + if (!resv) return 1; - } else { - long err; - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + idx = vma_hugecache_offset(h, vma, addr); + chg = region_chg(resv, idx, idx + 1); - err = region_chg(&resv->regions, idx, idx + 1); - if (err < 0) - return err; - return 0; - } + if (vma->vm_flags & VM_MAYSHARE) + return chg; + else + return chg < 0 ? chg : 0; } static void vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - region_add(&inode->i_mapping->private_list, idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + resv = vma_resv_map(vma); + if (!resv) + return; - /* Mark this page used in the map. */ - region_add(&resv->regions, idx, idx + 1); - } + idx = vma_hugecache_offset(h, vma, addr); + region_add(resv, idx, idx + 1); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -1218,24 +1388,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); - if (ret) { - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (ret) + goto out_subpool_put; + spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); - if (!page) { - hugetlb_cgroup_uncharge_cgroup(idx, - pages_per_huge_page(h), - h_cg); - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (!page) + goto out_uncharge_cgroup; + spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -1247,6 +1410,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, vma_commit_reservation(h, vma, addr); return page; + +out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); +out_subpool_put: + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + return ERR_PTR(-ENOSPC); } /* @@ -1294,7 +1464,7 @@ found: return 1; } -static void prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -1328,7 +1498,7 @@ static void __init gather_bootmem_prealloc(void) * fix confusing memory reports from free(1) and another * side-effects, like CommitLimit going negative. */ - if (h->order > (MAX_ORDER - 1)) + if (hstate_is_gigantic(h)) adjust_managed_page_count(page, 1 << h->order); } } @@ -1338,7 +1508,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_fresh_huge_page(h, @@ -1354,7 +1524,7 @@ static void __init hugetlb_init_hstates(void) for_each_hstate(h) { /* oversize hugepages were init'ed in early boot */ - if (h->order < MAX_ORDER) + if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } } @@ -1388,7 +1558,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, { int i; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; for_each_node_mask(i, *nodes_allowed) { @@ -1451,7 +1621,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, { unsigned long min_count, ret; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h) && !gigantic_page_supported()) return h->max_huge_pages; /* @@ -1478,7 +1648,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h, nodes_allowed); + if (hstate_is_gigantic(h)) + ret = alloc_fresh_gigantic_page(h, nodes_allowed); + else + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -1509,6 +1682,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, while (min_count < persistent_huge_pages(h)) { if (!free_pool_huge_page(h, nodes_allowed, 0)) break; + cond_resched_lock(&hugetlb_lock); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) @@ -1562,22 +1736,14 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) { int err; - int nid; - unsigned long count; - struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = kstrtoul(buf, 10, &count); - if (err) - goto out; - - h = kobj_to_hstate(kobj, &nid); - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; } @@ -1612,6 +1778,23 @@ out: return err; } +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1621,7 +1804,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj, static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(false, kobj, attr, buf, len); + return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); @@ -1640,7 +1823,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(true, kobj, attr, buf, len); + return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif @@ -1660,7 +1843,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); @@ -1944,16 +2127,15 @@ static void __exit hugetlb_exit(void) } kobject_put(hugepages_kobj); + kfree(htlb_fault_mutex_table); } module_exit(hugetlb_exit); static int __init hugetlb_init(void) { - /* Some platform decide whether they support huge pages at boot - * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + int i; + + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -1973,6 +2155,17 @@ static int __init hugetlb_init(void) hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); +#ifdef CONFIG_SMP + num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); +#else + num_fault_mutexes = 1; +#endif + htlb_fault_mutex_table = + kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); + BUG_ON(!htlb_fault_mutex_table); + + for (i = 0; i < num_fault_mutexes; i++) + mutex_init(&htlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); @@ -2066,13 +2259,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; - unsigned long tmp; + unsigned long tmp = h->max_huge_pages; int ret; - tmp = h->max_huge_pages; - - if (write && h->order >= MAX_ORDER) - return -EINVAL; + if (!hugepages_supported()) + return -ENOTSUPP; table->data = &tmp; table->maxlen = sizeof(unsigned long); @@ -2080,19 +2271,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (ret) goto out; - if (write) { - NODEMASK_ALLOC(nodemask_t, nodes_allowed, - GFP_KERNEL | __GFP_NORETRY); - if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); - } + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); out: return ret; } @@ -2122,9 +2303,12 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; - if (write && h->order >= MAX_ORDER) + if (write && hstate_is_gigantic(h)) return -EINVAL; table->data = &tmp; @@ -2147,6 +2331,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2163,6 +2349,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2177,6 +2365,9 @@ void hugetlb_show_meminfo(void) struct hstate *h; int nid; + if (!hugepages_supported()) + return; + for_each_node_state(nid, N_MEMORY) for_each_hstate(h) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", @@ -2251,41 +2442,30 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (resv) + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_get(&resv->refs); } -static void resv_map_put(struct vm_area_struct *vma) -{ - struct resv_map *resv = vma_resv_map(vma); - - if (!resv) - return; - kref_put(&resv->refs, resv_map_release); -} - static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); - unsigned long reserve; - unsigned long start; - unsigned long end; + unsigned long reserve, start, end; - if (resv) { - start = vma_hugecache_offset(h, vma, vma->vm_start); - end = vma_hugecache_offset(h, vma, vma->vm_end); + if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return; - reserve = (end - start) - - region_count(&resv->regions, start, end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); - resv_map_put(vma); + reserve = (end - start) - region_count(resv, start, end); - if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); - } + kref_put(&resv->refs, resv_map_release); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); + hugepage_subpool_put_pages(spool, reserve); } } @@ -2336,6 +2516,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) + return 1; + else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) + return 1; + else + return 0; +} int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -2375,7 +2580,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - if (!huge_pte_none(huge_ptep_get(src_pte))) { + entry = huge_ptep_get(src_pte); + if (huge_pte_none(entry)) { /* skip none entry */ + ; + } else if (unlikely(is_hugetlb_entry_migration(entry) || + is_hugetlb_entry_hwpoisoned(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + + if (is_write_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&swp_entry); + entry = swp_entry_to_pte(swp_entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_ptep_get(src_pte); @@ -2394,32 +2616,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, return ret; } -static int is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; - else - return 0; -} - -static int is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; - else - return 0; -} - void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) @@ -2554,8 +2750,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -2594,8 +2790,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); - - return 1; } /* @@ -2610,7 +2804,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int outside_reserve = 0; + int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2640,14 +2834,14 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page table lock as buddy allocator may be called */ + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { - long err = PTR_ERR(new_page); - page_cache_release(old_page); - /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -2656,28 +2850,25 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + page_cache_release(old_page); BUG_ON(huge_pte_none(pte)); - if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(huge_pte_none(pte)); - spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) - goto retry_avoidcopy; - /* - * race occurs while re-acquiring page table - * lock, and our job is done. - */ - return 0; - } - WARN_ON_ONCE(1); + unmap_ref_private(mm, vma, old_page, address); + BUG_ON(huge_pte_none(pte)); + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + return 0; } - /* Caller expects lock to be held */ - spin_lock(ptl); - if (err == -ENOMEM) - return VM_FAULT_OOM; - else - return VM_FAULT_SIGBUS; + ret = (PTR_ERR(new_page) == -ENOMEM) ? + VM_FAULT_OOM : VM_FAULT_SIGBUS; + goto out_release_old; } /* @@ -2685,11 +2876,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { - page_cache_release(new_page); - page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto out_release_all; } copy_user_huge_page(new_page, old_page, address, vma, @@ -2699,13 +2887,14 @@ retry_avoidcopy: mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Retake the page table lock to check for racing updates * before the page tables are altered */ spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) { + if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); /* Break COW */ @@ -2719,12 +2908,13 @@ retry_avoidcopy: } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out_release_all: page_cache_release(new_page); +out_release_old: page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return 0; + spin_lock(ptl); /* Caller expects lock to be held */ + return ret; } /* Return the pagecache page at a given address within a VMA */ @@ -2761,15 +2951,14 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags) + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; int anon_rmap = 0; - pgoff_t idx; unsigned long size; struct page *page; - struct address_space *mapping; pte_t new_pte; spinlock_t *ptl; @@ -2784,9 +2973,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - /* * Use page lock to guard against racing truncation * before we get page_table_lock. @@ -2871,8 +3057,7 @@ retry: if (anon_rmap) { ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); - } - else + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); @@ -2896,17 +3081,53 @@ backout_unlocked: goto out; } +#ifdef CONFIG_SMP +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + unsigned long key[2]; + u32 hash; + + if (vma->vm_flags & VM_SHARED) { + key[0] = (unsigned long) mapping; + key[1] = idx; + } else { + key[0] = (unsigned long) mm; + key[1] = address >> huge_page_shift(h); + } + + hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + + return hash & (num_fault_mutexes - 1); +} +#else +/* + * For uniprocesor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead. + */ +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + return 0; +} +#endif + int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep; - pte_t entry; + pte_t *ptep, entry; spinlock_t *ptl; int ret; + u32 hash; + pgoff_t idx; struct page *page = NULL; struct page *pagecache_page = NULL; - static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + struct address_space *mapping; address &= huge_page_mask(h); @@ -2925,15 +3146,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!ptep) return VM_FAULT_OOM; + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - mutex_lock(&hugetlb_instantiation_mutex); + hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&htlb_fault_mutex_table[hash]); + entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, flags); + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); goto out_mutex; } @@ -3002,8 +3228,7 @@ out_ptl: put_page(page); out_mutex: - mutex_unlock(&hugetlb_instantiation_mutex); - + mutex_unlock(&htlb_fault_mutex_table[hash]); return ret; } @@ -3120,6 +3345,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); + mmu_notifier_invalidate_range_start(mm, start, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -3149,6 +3375,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + mmu_notifier_invalidate_range_end(mm, start, end); return pages << h->order; } @@ -3161,6 +3388,7 @@ int hugetlb_reserve_pages(struct inode *inode, long ret, chg; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); + struct resv_map *resv_map; /* * Only apply hugepage reservation if asked. At fault time, an @@ -3176,10 +3404,13 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + resv_map = inode_resv_map(inode); + + chg = region_chg(resv_map, from, to); + + } else { + resv_map = resv_map_alloc(); if (!resv_map) return -ENOMEM; @@ -3222,20 +3453,23 @@ int hugetlb_reserve_pages(struct inode *inode, * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(&inode->i_mapping->private_list, from, to); + region_add(resv_map, from, to); return 0; out_err: - if (vma) - resv_map_put(vma); + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&resv_map->refs, resv_map_release); return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); - long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct resv_map *resv_map = inode_resv_map(inode); + long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + if (resv_map) + chg = region_truncate(resv_map, offset); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); @@ -3446,7 +3680,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* Can be overriden by architectures */ -__attribute__((weak)) struct page * +struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write) { diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index cb00829bb46..a67c26e0f36 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -30,7 +30,6 @@ struct hugetlb_cgroup { #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -struct cgroup_subsys hugetlb_subsys __read_mostly; static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline @@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) @@ -53,7 +52,7 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); + return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) @@ -182,7 +181,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget(&h_cg->css)) { + if (!css_tryget_online(&h_cg->css)) { rcu_read_unlock(); goto again; } @@ -218,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); + lockdep_assert_held(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(page); if (unlikely(!h_cg)) return; @@ -254,15 +253,16 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, return res_counter_read_u64(&h_cg->hugepage[idx], name); } -static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret; unsigned long long val; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -272,26 +272,27 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; + val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx])); ret = res_counter_set_limit(&h_cg->hugepage[idx], val); break; default: ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } -static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret = 0; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(event); - name = MEMFILE_ATTR(event); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -304,7 +305,7 @@ static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) @@ -332,7 +333,7 @@ static void __init __hugetlb_cgroup_file_init(int idx) snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; - cft->write_string = hugetlb_cgroup_write; + cft->write = hugetlb_cgroup_write; /* Add the usage file */ cft = &h->cgroup_files[1]; @@ -344,23 +345,22 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ cft = &h->cgroup_files[4]; memset(cft, 0, sizeof(*cft)); - WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - - return; + WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files)); } void __init hugetlb_cgroup_file_init(void) @@ -402,10 +402,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } -struct cgroup_subsys hugetlb_subsys = { - .name = "hugetlb", +struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, - .subsys_id = hugetlb_subsys_id, }; diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 95487c71cad..329caf56df2 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { - if (hwpoison_dir) - debugfs_remove_recursive(hwpoison_dir); + debugfs_remove_recursive(hwpoison_dir); } static int pfn_inject_init(void) diff --git a/mm/internal.h b/mm/internal.h index 29e1e761f9e..a1b651b11c5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H +#include <linux/fs.h> #include <linux/mm.h> void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + /* * Turn a non-refcounted page (->_count == 0) into refcounted with * a count of one. @@ -119,7 +134,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool finished_update_free; /* True when the zone cached pfns are * no longer being updated @@ -129,7 +144,10 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended, or + * need_resched() true during async + * compaction + */ }; unsigned long @@ -154,6 +172,11 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); @@ -169,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path, to determine if a new page is being - * mapped into a LOCKED vma. If it is, mark page as mlocked. - */ -static inline int mlocked_vma_newpage(struct vm_area_struct *vma, - struct page *page) -{ - VM_BUG_ON_PAGE(PageLRU(page), page); - - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) - return 0; - - if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, - hpage_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); - } - return 1; -} - -/* * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); @@ -230,10 +233,6 @@ extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); #endif #else /* !CONFIG_MMU */ -static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) -{ - return 0; -} static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } @@ -248,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline struct page *mem_map_offset(struct page *base, int offset) { if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return pfn_to_page(page_to_pfn(base) + offset); + return nth_page(base, offset); return base + offset; } @@ -370,5 +369,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_FAIR 0x100 /* fair zone allocation */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/iov_iter.c b/mm/iov_iter.c new file mode 100644 index 00000000000..9a09f2034fc --- /dev/null +++ b/mm/iov_iter.c @@ -0,0 +1,746 @@ +#include <linux/export.h> +#include <linux/uio.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_readable(buf, copy)) { + kaddr = kmap_atomic(page); + to = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = to - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + to = kaddr + offset; + left = __copy_from_user(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +static size_t copy_from_user_atomic_iovec(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr); + + return copied; +} + +static void advance_iovec(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + if (!(i->type & ITER_BVEC)) { + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); + } + return 0; +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +static unsigned long alignment_iovec(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = (unsigned long)iov->iov_base + i->iov_offset; + n = iov->iov_len - i->iov_offset; + if (n >= size) + return res | size; + size -= n; + res |= n; + while (size > (++iov)->iov_len) { + res |= (unsigned long)iov->iov_base | iov->iov_len; + size -= iov->iov_len; + } + res |= (unsigned long)iov->iov_base | size; + return res; +} + +void iov_iter_init(struct iov_iter *i, int direction, + const struct iovec *iov, unsigned long nr_segs, + size_t count) +{ + /* It will get better. Eventually... */ + if (segment_eq(get_fs(), KERNEL_DS)) + direction |= ITER_KVEC; + i->type = direction; + i->iov = iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_init); + +static ssize_t get_pages_iovec(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + size_t offset = i->iov_offset; + const struct iovec *iov = i->iov; + size_t len; + unsigned long addr; + int n; + int res; + + len = iov->iov_len - offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + addr = (unsigned long)iov->iov_base + offset; + len += *start = addr & (PAGE_SIZE - 1); + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; + addr &= ~(PAGE_SIZE - 1); + n = (len + PAGE_SIZE - 1) / PAGE_SIZE; + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + if (unlikely(res < 0)) + return res; + return (res == n ? len : res * PAGE_SIZE) - *start; +} + +static ssize_t get_pages_alloc_iovec(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + size_t offset = i->iov_offset; + const struct iovec *iov = i->iov; + size_t len; + unsigned long addr; + void *p; + int n; + int res; + + len = iov->iov_len - offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + addr = (unsigned long)iov->iov_base + offset; + len += *start = addr & (PAGE_SIZE - 1); + addr &= ~(PAGE_SIZE - 1); + n = (len + PAGE_SIZE - 1) / PAGE_SIZE; + + p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); + if (!p) + p = vmalloc(n * sizeof(struct page *)); + if (!p) + return -ENOMEM; + + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + if (unlikely(res < 0)) { + kvfree(p); + return res; + } + *pages = p; + return (res == n ? len : res * PAGE_SIZE) - *start; +} + +static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages) +{ + size_t offset = i->iov_offset; + size_t size = i->count; + const struct iovec *iov = i->iov; + int npages = 0; + int n; + + for (n = 0; size && n < i->nr_segs; n++, iov++) { + unsigned long addr = (unsigned long)iov->iov_base + offset; + size_t len = iov->iov_len - offset; + offset = 0; + if (unlikely(!len)) /* empty segment */ + continue; + if (len > size) + len = size; + npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE + - addr / PAGE_SIZE; + if (npages >= maxpages) /* don't bother going further */ + return maxpages; + size -= len; + offset = 0; + } + return min(npages, maxpages); +} + +static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) +{ + char *from = kmap_atomic(page); + memcpy(to, from + offset, len); + kunmap_atomic(from); +} + +static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) +{ + char *to = kmap_atomic(page); + memcpy(to + offset, from, len); + kunmap_atomic(to); +} + +static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + copy = min_t(size_t, bytes, bvec->bv_len - skip); + + kaddr = kmap_atomic(page); + from = kaddr + offset; + memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); + skip += copy; + from += copy; + bytes -= copy; + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy); + skip = copy; + from += copy; + bytes -= copy; + } + kunmap_atomic(kaddr); + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + + kaddr = kmap_atomic(page); + + to = kaddr + offset; + + copy = min(bytes, bvec->bv_len - skip); + + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); + + to += copy; + skip += copy; + bytes -= copy; + + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy); + skip = copy; + to += copy; + bytes -= copy; + } + kunmap_atomic(kaddr); + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted; +} + +static size_t copy_from_user_bvec(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t left; + const struct bio_vec *bvec; + size_t base = i->iov_offset; + + kaddr = kmap_atomic(page); + for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) { + size_t copy = min(left, bvec->bv_len - base); + if (!bvec->bv_len) + continue; + memcpy_from_page(kaddr + offset, bvec->bv_page, + bvec->bv_offset + base, copy); + offset += copy; + left -= copy; + } + kunmap_atomic(kaddr); + return bytes; +} + +static void advance_bvec(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct bio_vec *bvec = i->bvec; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !bvec->bv_len)) { + int copy; + + copy = min(bytes, bvec->bv_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (bvec->bv_len == base) { + bvec++; + nr_segs--; + base = 0; + } + } + i->bvec = bvec; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} + +static unsigned long alignment_bvec(const struct iov_iter *i) +{ + const struct bio_vec *bvec = i->bvec; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = bvec->bv_offset + i->iov_offset; + n = bvec->bv_len - i->iov_offset; + if (n >= size) + return res | size; + size -= n; + res |= n; + while (size > (++bvec)->bv_len) { + res |= bvec->bv_offset | bvec->bv_len; + size -= bvec->bv_len; + } + res |= bvec->bv_offset | size; + return res; +} + +static ssize_t get_pages_bvec(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + const struct bio_vec *bvec = i->bvec; + size_t len = bvec->bv_len - i->iov_offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + /* can't be more than PAGE_SIZE */ + *start = bvec->bv_offset + i->iov_offset; + + get_page(*pages = bvec->bv_page); + + return len; +} + +static ssize_t get_pages_alloc_bvec(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + const struct bio_vec *bvec = i->bvec; + size_t len = bvec->bv_len - i->iov_offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + *start = bvec->bv_offset + i->iov_offset; + + *pages = kmalloc(sizeof(struct page *), GFP_KERNEL); + if (!*pages) + return -ENOMEM; + + get_page(**pages = bvec->bv_page); + + return len; +} + +static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages) +{ + size_t offset = i->iov_offset; + size_t size = i->count; + const struct bio_vec *bvec = i->bvec; + int npages = 0; + int n; + + for (n = 0; size && n < i->nr_segs; n++, bvec++) { + size_t len = bvec->bv_len - offset; + offset = 0; + if (unlikely(!len)) /* empty segment */ + continue; + if (len > size) + len = size; + npages++; + if (npages >= maxpages) /* don't bother going further */ + return maxpages; + size -= len; + offset = 0; + } + return min(npages, maxpages); +} + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_page_to_iter_bvec(page, offset, bytes, i); + else + return copy_page_to_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_to_iter); + +size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_page_from_iter_bvec(page, offset, bytes, i); + else + return copy_page_from_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_from_iter); + +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + if (i->type & ITER_BVEC) + return copy_from_user_bvec(page, i, offset, bytes); + else + return copy_from_user_atomic_iovec(page, i, offset, bytes); +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +void iov_iter_advance(struct iov_iter *i, size_t size) +{ + if (i->type & ITER_BVEC) + advance_bvec(i, size); + else + advance_iovec(i, size); +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + if (i->nr_segs == 1) + return i->count; + else if (i->type & ITER_BVEC) + return min(i->count, i->iov->iov_len - i->iov_offset); + else + return min(i->count, i->bvec->bv_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +unsigned long iov_iter_alignment(const struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return alignment_bvec(i); + else + return alignment_iovec(i); +} +EXPORT_SYMBOL(iov_iter_alignment); + +ssize_t iov_iter_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + if (i->type & ITER_BVEC) + return get_pages_bvec(i, pages, maxsize, maxpages, start); + else + return get_pages_iovec(i, pages, maxsize, maxpages, start); +} +EXPORT_SYMBOL(iov_iter_get_pages); + +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + if (i->type & ITER_BVEC) + return get_pages_alloc_bvec(i, pages, maxsize, start); + else + return get_pages_alloc_iovec(i, pages, maxsize, start); +} +EXPORT_SYMBOL(iov_iter_get_pages_alloc); + +int iov_iter_npages(const struct iov_iter *i, int maxpages) +{ + if (i->type & ITER_BVEC) + return iov_iter_npages_bvec(i, maxpages); + else + return iov_iter_npages_iovec(i, maxpages); +} +EXPORT_SYMBOL(iov_iter_npages); diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index ff0d9779cec..dcdcadb6953 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "kmemleak: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -50,25 +52,25 @@ static int __init kmemleak_test_init(void) printk(KERN_INFO "Kmemleak testing\n"); /* make some orphan objects */ - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); #ifndef CONFIG_MODULES - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); #endif - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); /* * Add elements to a list. They should only appear as orphan @@ -76,7 +78,7 @@ static int __init kmemleak_test_init(void) */ for (i = 0; i < 10; i++) { elem = kzalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); + pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; INIT_LIST_HEAD(&elem->list); @@ -85,7 +87,7 @@ static int __init kmemleak_test_init(void) for_each_possible_cpu(i) { per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); - pr_info("kmemleak: kmalloc(129) = %p\n", + pr_info("kmalloc(129) = %p\n", per_cpu(kmemleak_test_pointer, i)); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 31f01c5011e..3cda50c1e39 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -192,15 +192,15 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +static int kmemleak_enabled; /* set in the late_initcall if there were no errors */ -static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +static int kmemleak_initialized; /* enables or disables early logging of the memory operations */ -static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ -static atomic_t kmemleak_warning = ATOMIC_INIT(0); +static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ -static atomic_t kmemleak_error = ATOMIC_INIT(0); +static int kmemleak_error; /* minimum and maximum address that may be valid pointers */ static unsigned long min_addr = ULONG_MAX; @@ -218,7 +218,8 @@ static int kmemleak_stack_scan = 1; static DEFINE_MUTEX(scan_mutex); /* setting kmemleak=on, will set this var, skipping the disable */ static int kmemleak_skip_disable; - +/* If there are leaks that can be reported */ +static bool kmemleak_found_leaks; /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -267,7 +268,7 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) do { \ pr_warning(x); \ dump_stack(); \ - atomic_set(&kmemleak_warning, 1); \ + kmemleak_warning = 1; \ } while (0) /* @@ -386,7 +387,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%lx\n", object->flags); - pr_notice(" checksum = %d\n", object->checksum); + pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -805,7 +806,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, unsigned long flags; struct early_log *log; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* kmemleak stopped recording, just count the requests */ crt_early_log++; return; @@ -840,7 +841,7 @@ static void early_alloc(struct early_log *log) unsigned long flags; int i; - if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) return; /* @@ -893,9 +894,9 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -919,11 +920,11 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) * Percpu allocations are only scanned and not reported as leaks * (min_count is set to 0). */ - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, GFP_KERNEL); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -939,9 +940,9 @@ void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -959,9 +960,9 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -979,16 +980,50 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); /** + * kmemleak_update_trace - update object allocation stack trace + * @ptr: pointer to beginning of the object + * + * Override the object allocation stack trace for cases where the actual + * allocation place is not always useful. + */ +void __ref kmemleak_update_trace(const void *ptr) +{ + struct kmemleak_object *object; + unsigned long flags; + + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr)) + return; + + object = find_and_get_object((unsigned long)ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Updating stack trace for unknown object at %p\n", + ptr); +#endif + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->trace_len = __save_stack_trace(object->trace); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); +} +EXPORT_SYMBOL(kmemleak_update_trace); + +/** * kmemleak_not_leak - mark an allocated object as false positive * @ptr: pointer to beginning of the object * @@ -999,9 +1034,9 @@ void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1019,9 +1054,9 @@ void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1041,9 +1076,9 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1061,9 +1096,9 @@ void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -1088,7 +1123,7 @@ static bool update_checksum(struct kmemleak_object *object) */ static int scan_should_stop(void) { - if (!atomic_read(&kmemleak_enabled)) + if (!kmemleak_enabled) return 1; /* @@ -1299,7 +1334,7 @@ static void kmemleak_scan(void) /* * Struct page scanning for each node. */ - lock_memory_hotplug(); + get_online_mems(); for_each_online_node(i) { unsigned long start_pfn = node_start_pfn(i); unsigned long end_pfn = node_end_pfn(i); @@ -1317,7 +1352,7 @@ static void kmemleak_scan(void) scan_block(page, page + 1, NULL, 1); } } - unlock_memory_hotplug(); + put_online_mems(); /* * Scanning the task stacks (may introduce false negatives). @@ -1382,9 +1417,12 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - if (new_leaks) + if (new_leaks) { + kmemleak_found_leaks = true; + pr_info("%d new suspected memory leaks (see " "/sys/kernel/debug/kmemleak)\n", new_leaks); + } } @@ -1545,11 +1583,6 @@ static int kmemleak_open(struct inode *inode, struct file *file) return seq_open(file, &kmemleak_seq_ops); } -static int kmemleak_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - static int dump_str_object_info(const char *str) { unsigned long flags; @@ -1592,8 +1625,12 @@ static void kmemleak_clear(void) spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); + + kmemleak_found_leaks = false; } +static void __kmemleak_do_cleanup(void); + /* * File write operation to configure kmemleak at run-time. The following * commands can be written to the /sys/kernel/debug/kmemleak file: @@ -1606,7 +1643,8 @@ static void kmemleak_clear(void) * disable it) * scan - trigger a memory scan * clear - mark all current reported unreferenced kmemleak objects as - * grey to ignore printing them + * grey to ignore printing them, or free all kmemleak objects + * if kmemleak has been disabled. * dump=... - dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, @@ -1616,9 +1654,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, int buf_size; int ret; - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; - buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; @@ -1628,6 +1663,19 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, if (ret < 0) return ret; + if (strncmp(buf, "clear", 5) == 0) { + if (kmemleak_enabled) + kmemleak_clear(); + else + __kmemleak_do_cleanup(); + goto out; + } + + if (!kmemleak_enabled) { + ret = -EBUSY; + goto out; + } + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1651,8 +1699,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } } else if (strncmp(buf, "scan", 4) == 0) kmemleak_scan(); - else if (strncmp(buf, "clear", 5) == 0) - kmemleak_clear(); else if (strncmp(buf, "dump=", 5) == 0) ret = dump_str_object_info(buf + 5); else @@ -1674,9 +1720,19 @@ static const struct file_operations kmemleak_fops = { .read = seq_read, .write = kmemleak_write, .llseek = seq_lseek, - .release = kmemleak_release, + .release = seq_release, }; +static void __kmemleak_do_cleanup(void) +{ + struct kmemleak_object *object; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); +} + /* * Stop the memory scanning thread and free the kmemleak internal objects if * no previous scan thread (otherwise, kmemleak may still have some useful @@ -1684,18 +1740,14 @@ static const struct file_operations kmemleak_fops = { */ static void kmemleak_do_cleanup(struct work_struct *work) { - struct kmemleak_object *object; - bool cleanup = scan_thread == NULL; - mutex_lock(&scan_mutex); stop_scan_thread(); - if (cleanup) { - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) - delete_object_full(object->pointer); - rcu_read_unlock(); - } + if (!kmemleak_found_leaks) + __kmemleak_do_cleanup(); + else + pr_info("Kmemleak disabled without freeing internal data. " + "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); mutex_unlock(&scan_mutex); } @@ -1708,14 +1760,14 @@ static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); static void kmemleak_disable(void) { /* atomically check whether it was already invoked */ - if (atomic_cmpxchg(&kmemleak_error, 0, 1)) + if (cmpxchg(&kmemleak_error, 0, 1)) return; /* stop any memory operation tracing */ - atomic_set(&kmemleak_enabled, 0); + kmemleak_enabled = 0; /* check whether it is too early for a kernel thread */ - if (atomic_read(&kmemleak_initialized)) + if (kmemleak_initialized) schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); @@ -1759,7 +1811,7 @@ void __init kmemleak_init(void) #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { - atomic_set(&kmemleak_early_log, 0); + kmemleak_early_log = 0; kmemleak_disable(); return; } @@ -1777,12 +1829,12 @@ void __init kmemleak_init(void) /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); - atomic_set(&kmemleak_early_log, 0); - if (atomic_read(&kmemleak_error)) { + kmemleak_early_log = 0; + if (kmemleak_error) { local_irq_restore(flags); return; } else - atomic_set(&kmemleak_enabled, 1); + kmemleak_enabled = 1; local_irq_restore(flags); /* @@ -1826,9 +1878,9 @@ void __init kmemleak_init(void) log->op_type); } - if (atomic_read(&kmemleak_warning)) { + if (kmemleak_warning) { print_log_trace(log); - atomic_set(&kmemleak_warning, 0); + kmemleak_warning = 0; } } } @@ -1840,9 +1892,9 @@ static int __init kmemleak_late_init(void) { struct dentry *dentry; - atomic_set(&kmemleak_initialized, 1); + kmemleak_initialized = 1; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately @@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - BUG_ON(pmd_trans_huge(*pmd)); mmun_start = addr; mmun_end = addr + PAGE_SIZE; @@ -1979,18 +1978,12 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE -static int just_wait(void *word) -{ - schedule(); - return 0; -} - static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), - just_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } diff --git a/mm/list_lru.c b/mm/list_lru.c index 72f9decb010..f1a0db19417 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -87,11 +87,20 @@ restart: ret = isolate(item, &nlru->lock, cb_arg); switch (ret) { + case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); case LRU_REMOVED: if (--nlru->nr_items == 0) node_clear(nid, lru->active_nodes); WARN_ON_ONCE(nlru->nr_items < 0); isolated++; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. + */ + if (ret == LRU_REMOVED_RETRY) + goto restart; break; case LRU_ROTATE: list_move_tail(item, &nlru->list); @@ -103,6 +112,7 @@ restart: * The lru lock has been dropped, our list traversal is * now invalid and so we have to restart from scratch. */ + assert_spin_locked(&nlru->lock); goto restart; default: BUG(); @@ -114,7 +124,7 @@ restart: } EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init(struct list_lru *lru) +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; @@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru) nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); INIT_LIST_HEAD(&lru->node[i].list); lru->node[i].nr_items = 0; } return 0; } -EXPORT_SYMBOL_GPL(list_lru_init); +EXPORT_SYMBOL_GPL(list_lru_init_key); void list_lru_destroy(struct list_lru *lru) { diff --git a/mm/madvise.c b/mm/madvise.c index 539eeb96b32..0938b30da4a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, for (; start < end; start += PAGE_SIZE) { index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - page = find_get_page(mapping, index); + page = find_get_entry(mapping, index); if (!radix_tree_exceptional_entry(page)) { if (page) page_cache_release(page); @@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma, /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. - * - * NOTE: Currently, only shmfs/tmpfs is supported for this operation. - * Other filesystems return -ENOSYS. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, diff --git a/mm/memblock.c b/mm/memblock.c index 39a31e7f004..6ecb0d937fb 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -27,6 +27,9 @@ static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; +#endif struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, @@ -37,6 +40,12 @@ struct memblock memblock __initdata_memblock = { .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + .physmem.regions = memblock_physmem_init_regions, + .physmem.cnt = 1, /* empty dummy entry */ + .physmem.max = INIT_PHYSMEM_REGIONS, +#endif + .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; @@ -183,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid) { - int ret; - phys_addr_t kernel_end; + phys_addr_t kernel_end, ret; /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) @@ -472,7 +480,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, } /** - * memblock_add_region - add new memblock region + * memblock_add_range - add new memblock region * @type: memblock type to add new region into * @base: base address of the new region * @size: size of the new region @@ -487,7 +495,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * RETURNS: * 0 on success, -errno on failure. */ -static int __init_memblock memblock_add_region(struct memblock_type *type, +int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, unsigned long flags) { @@ -569,12 +577,12 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid, 0); + return memblock_add_range(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, + return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } @@ -654,8 +662,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, return 0; } -static int __init_memblock __memblock_remove(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_remove_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; int i, ret; @@ -671,9 +679,10 @@ static int __init_memblock __memblock_remove(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { - return __memblock_remove(&memblock.memory, base, size); + return memblock_remove_range(&memblock.memory, base, size); } + int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", @@ -681,7 +690,8 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) (unsigned long long)base + size - 1, (void *)_RET_IP_); - return __memblock_remove(&memblock.reserved, base, size); + kmemleak_free_part(__va(base), size); + return memblock_remove_range(&memblock.reserved, base, size); } static int __init_memblock memblock_reserve_region(phys_addr_t base, @@ -696,7 +706,7 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_region(_rgn, base, size, nid, flags); + return memblock_add_range(_rgn, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) @@ -758,17 +768,19 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) } /** - * __next_free_mem_range - next function for for_each_free_mem_range() + * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * - * Find the first free area from *@idx which matches @nid, fill the out + * Find the first area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of - * *@idx contains index into memory region and the upper 32bit indexes the - * areas before each reserved region. For example, if reserved regions + * *@idx contains index into type_a and the upper 32bit indexes the + * areas before each region in type_b. For example, if type_b regions * look like the following, * * 0:[0-16), 1:[32-48), 2:[128-130) @@ -780,53 +792,81 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ -void __init_memblock __next_free_mem_range(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __init_memblock __next_mem_range(u64 *idx, int nid, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + if (WARN_ONCE(nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; - for ( ; mi < mem->cnt; mi++) { - struct memblock_region *m = &mem->regions[mi]; + for (; idx_a < type_a->cnt; idx_a++) { + struct memblock_region *m = &type_a->regions[idx_a]; + phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != m_nid) continue; - /* scan areas before each reservation for intersection */ - for ( ; ri < rsv->cnt + 1; ri++) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } - /* if ri advanced past mi, break out to advance mi */ + /* scan areas before each reservation */ + for (; idx_b < type_b->cnt + 1; idx_b++) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? + r->base : ULLONG_MAX; + + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ if (r_start >= m_end) break; /* if the two regions intersect, we're done */ if (m_start < r_end) { if (out_start) - *out_start = max(m_start, r_start); + *out_start = + max(m_start, r_start); if (out_end) *out_end = min(m_end, r_end); if (out_nid) - *out_nid = memblock_get_region_node(m); + *out_nid = m_nid; /* - * The region which ends first is advanced - * for the next iteration. + * The region which ends first is + * advanced for the next iteration. */ if (m_end <= r_end) - mi++; + idx_a++; else - ri++; - *idx = (u32)mi | (u64)ri << 32; + idx_b++; + *idx = (u32)idx_a | (u64)idx_b << 32; return; } } @@ -837,57 +877,80 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, } /** - * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() + * __next_mem_range_rev - generic next function for for_each_*_range_rev() + * + * Finds the next range from type_a which is not marked as unsuitable + * in type_b. + * * @idx: pointer to u64 loop variable * @nid: nid: node selector, %NUMA_NO_NODE for all nodes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * - * Reverse of __next_free_mem_range(). - * - * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't - * be able to hot-remove hotpluggable memory used by the kernel. So this - * function skip hotpluggable regions if needed when allocating memory for the - * kernel. + * Reverse of __next_mem_range(). */ -void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; if (*idx == (u64)ULLONG_MAX) { - mi = mem->cnt - 1; - ri = rsv->cnt; + idx_a = type_a->cnt - 1; + idx_b = type_b->cnt; } - for ( ; mi >= 0; mi--) { - struct memblock_region *m = &mem->regions[mi]; + for (; idx_a >= 0; idx_a--) { + struct memblock_region *m = &type_a->regions[idx_a]; + phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != m_nid) continue; /* skip hotpluggable memory regions if needed */ if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; - /* scan areas before each reservation for intersection */ - for ( ; ri >= 0; ri--) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b >= 0; idx_b--) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? + r->base : ULLONG_MAX; + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ - /* if ri advanced past mi, break out to advance mi */ if (r_end <= m_start) break; /* if the two regions intersect, we're done */ @@ -897,18 +960,17 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, if (out_end) *out_end = min(m_end, r_end); if (out_nid) - *out_nid = memblock_get_region_node(m); - + *out_nid = m_nid; if (m_start >= r_start) - mi--; + idx_a--; else - ri--; - *idx = (u32)mi | (u64)ri << 32; + idx_b--; + *idx = (u32)idx_a | (u64)idx_b << 32; return; } } } - + /* signal end of iteration */ *idx = ULLONG_MAX; } @@ -975,22 +1037,40 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, - phys_addr_t align, phys_addr_t max_addr, - int nid) +static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { phys_addr_t found; if (!align) align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, 0, max_addr, nid); - if (found && !memblock_reserve(found, size)) + found = memblock_find_in_range_node(size, align, start, end, nid); + if (found && !memblock_reserve(found, size)) { + /* + * The min_count is set to 0 so that memblock allocations are + * never reported as leaks. + */ + kmemleak_alloc(__va(found), size, 0, 0); return found; - + } return 0; } +phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end) +{ + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); +} + +static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t max_addr, + int nid) +{ + return memblock_alloc_range_nid(size, align, 0, max_addr, nid); +} + phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); @@ -1201,7 +1281,7 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) __func__, (u64)base, (u64)base + size - 1, (void *)_RET_IP_); kmemleak_free_part(__va(base), size); - __memblock_remove(&memblock.reserved, base, size); + memblock_remove_range(&memblock.reserved, base, size); } /* @@ -1253,7 +1333,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) pages += end_pfn - start_pfn; } - return (phys_addr_t)pages << PAGE_SHIFT; + return PFN_PHYS(pages); } /* lowest address */ @@ -1271,16 +1351,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - unsigned long i; phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + struct memblock_region *r; if (!limit) return; /* find out max address */ - for (i = 0; i < memblock.memory.cnt; i++) { - struct memblock_region *r = &memblock.memory.regions[i]; - + for_each_memblock(memory, r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -1289,8 +1367,10 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) } /* truncate both memory and reserved regions */ - __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); - __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); + memblock_remove_range(&memblock.memory, max_addr, + (phys_addr_t)ULLONG_MAX); + memblock_remove_range(&memblock.reserved, max_addr, + (phys_addr_t)ULLONG_MAX); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) @@ -1326,14 +1406,13 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { struct memblock_type *type = &memblock.memory; - int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); + int mid = memblock_search(type, PFN_PHYS(pfn)); if (mid == -1) return -1; - *start_pfn = type->regions[mid].base >> PAGE_SHIFT; - *end_pfn = (type->regions[mid].base + type->regions[mid].size) - >> PAGE_SHIFT; + *start_pfn = PFN_DOWN(type->regions[mid].base); + *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); return type->regions[mid].nid; } @@ -1379,13 +1458,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si void __init_memblock memblock_trim_memory(phys_addr_t align) { - int i; phys_addr_t start, end, orig_start, orig_end; - struct memblock_type *mem = &memblock.memory; + struct memblock_region *r; - for (i = 0; i < mem->cnt; i++) { - orig_start = mem->regions[i].base; - orig_end = mem->regions[i].base + mem->regions[i].size; + for_each_memblock(memory, r) { + orig_start = r->base; + orig_end = r->base + r->size; start = round_up(orig_start, align); end = round_down(orig_end, align); @@ -1393,11 +1471,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align) continue; if (start < end) { - mem->regions[i].base = start; - mem->regions[i].size = end - start; + r->base = start; + r->size = end - start; } else { - memblock_remove_region(mem, i); - i--; + memblock_remove_region(&memblock.memory, + r - memblock.memory.regions); + r--; } } } @@ -1407,6 +1486,11 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) memblock.current_limit = limit; } +phys_addr_t __init_memblock memblock_get_current_limit(void) +{ + return memblock.current_limit; +} + static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; @@ -1499,6 +1583,9 @@ static int __init memblock_init_debugfs(void) return -ENXIO; debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops); +#endif return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b6b0039f72..28928ce9b07 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,8 +66,8 @@ #include <trace/events/vmscan.h> -struct cgroup_subsys mem_cgroup_subsys __read_mostly; -EXPORT_SYMBOL(mem_cgroup_subsys); +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -80,7 +80,7 @@ int do_swap_account __read_mostly; #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata; #endif #else @@ -292,6 +292,9 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; + /* css_online() has been completed */ + int initialized; + /* * the counter to account for mem+swap usage. */ @@ -357,10 +360,9 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list. per-memcg */ + /* analogous to slab_common's slab_caches list, but per-memcg; + * protected by memcg_slab_mutex */ struct list_head memcg_slab_caches; - /* Not a spinlock, we can take a lot of time walking the list */ - struct mutex slab_caches_mutex; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -527,18 +529,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - /* - * The ID of the root cgroup is 0, but memcg treat 0 as an - * invalid ID, so we return (cgroup_id + 1). - */ - return memcg->css.cgroup->id + 1; + return memcg->css.id; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &mem_cgroup_subsys); + css = css_from_id(id, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } @@ -571,7 +569,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); if (!mem_cgroup_is_root(memcg) && - memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { + memcg_proto_active(cg_proto) && + css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -677,9 +676,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg) static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { - VM_BUG_ON((unsigned)nid >= nr_node_ids); + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } @@ -689,12 +690,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(memcg, nid, zid); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } static struct mem_cgroup_tree_per_zone * @@ -712,11 +713,9 @@ soft_limit_tree_from_page(struct page *page) return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -746,10 +745,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, mz->on_tree = true; } -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { if (!mz->on_tree) return; @@ -757,14 +754,14 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, mz->on_tree = false; } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { - spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); - spin_unlock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); + __mem_cgroup_remove_exceeded(mz, mctz); + spin_unlock_irqrestore(&mctz->lock, flags); } @@ -773,47 +770,47 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) unsigned long long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree_from_page(page); /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = mem_cgroup_page_zoneinfo(memcg, page); excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. */ if (excess || mz->on_tree) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); - spin_unlock(&mctz->lock); + __mem_cgroup_insert_exceeded(mz, mctz, excess); + spin_unlock_irqrestore(&mctz->lock, flags); } } } static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - int node, zone; - struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_zone *mz; + int nid, zid; - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + mctz = soft_limit_tree_node_zone(nid, zid); + mem_cgroup_remove_exceeded(mz, mctz); } } } @@ -836,9 +833,9 @@ retry: * we will to add it back at the end of reclaim to its correct * position in the tree. */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) + !css_tryget_online(&mz->memcg->css)) goto retry; done: return mz; @@ -849,9 +846,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { struct mem_cgroup_per_zone *mz; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); return mz; } @@ -892,13 +889,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, return val; } -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) -{ - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -919,15 +909,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { - preempt_disable(); - /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@ -947,12 +935,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, } __this_cpu_add(memcg->stat->nr_page_events, nr_pages); - - preempt_enable(); } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -960,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, - unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, + unsigned int lru_mask) { - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - unsigned long ret = 0; - - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - for_each_lru(lru) { - if (BIT(lru) & lru_mask) - ret += mz->lru_size[lru]; - } - return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - u64 total = 0; + unsigned long nr = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(memcg, - nid, zid, lru_mask); + VM_BUG_ON((unsigned)nid >= nr_node_ids); - return total; + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct mem_cgroup_per_zone *mz; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + nr += mz->lru_size[lru]; + } + } + return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { + unsigned long nr = 0; int nid; - u64 total = 0; for_each_node_state(nid, N_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return total; + nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + return nr; } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1036,7 +1013,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { - preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -1049,8 +1025,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); #endif - preempt_enable(); - mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); @@ -1058,8 +1032,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); #endif - } else - preempt_enable(); + } } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1072,26 +1045,28 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } -struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; - if (!mm) - return NULL; - /* - * Because we have no locks, mm->owner's may be being moved to other - * cgroup. We use css_tryget() here even if this looks - * pessimistic (rather than adding locks here). - */ rcu_read_lock(); do { - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - break; - } while (!css_tryget(&memcg->css)); + /* + * Page cache insertions can happen withou an + * actual mm context, e.g. during disk probing + * on boot, loopback IO, acct() writes etc. + */ + if (unlikely(!mm)) + memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + memcg = root_mem_cgroup; + } + } while (!css_tryget_online(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -1127,9 +1102,21 @@ skip_node: * skipping css reference should be safe. */ if (next_css) { - if ((next_css == &root->css) || - ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) - return mem_cgroup_from_css(next_css); + struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); + + if (next_css == &root->css) + return memcg; + + if (css_tryget_online(next_css)) { + /* + * Make sure the memcg is initialized: + * mem_cgroup_css_online() orders the the + * initialization against setting the flag. + */ + if (smp_load_acquire(&memcg->initialized)) + return memcg; + css_put(next_css); + } prev_css = next_css; goto skip_node; @@ -1174,7 +1161,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, * would be returned all the time. */ if (position && position != root && - !css_tryget(&position->css)) + !css_tryget_online(&position->css)) position = NULL; } return position; @@ -1245,11 +1232,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, int uninitialized_var(seq); if (reclaim) { - int nid = zone_to_nid(reclaim->zone); - int zid = zone_idx(reclaim->zone); struct mem_cgroup_per_zone *mz; - mz = mem_cgroup_zoneinfo(root, nid, zid); + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; @@ -1356,7 +1341,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, goto out; } - mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + mz = mem_cgroup_zone_zoneinfo(memcg, zone); lruvec = &mz->lruvec; out: /* @@ -1369,20 +1354,6 @@ out: return lruvec; } -/* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@ -1415,7 +1386,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; - mz = page_cgroup_zoneinfo(memcg, page); + mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1486,7 +1457,7 @@ bool task_in_mem_cgroup(struct task_struct *task, p = find_lock_task_mm(task); if (p) { - curr = try_get_mem_cgroup_from_mm(p->mm); + curr = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1500,8 +1471,6 @@ bool task_in_mem_cgroup(struct task_struct *task, css_get(&curr->css); rcu_read_unlock(); } - if (!curr) - return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -1555,7 +1524,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* root ? */ - if (!css_parent(&memcg->css)) + if (mem_cgroup_disabled() || !memcg->css.parent) return vm_swappiness; return memcg->swappiness; @@ -1599,23 +1568,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) } /* - * 2 routines for checking "mem" is under move_account() or not. + * A routine for checking "mem" is under move_account() or not. * - * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This - * is used for avoiding races in accounting. If true, - * pc->mem_cgroup may be overwritten. - * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - * under hierarchy of moving cgroups. This is for - * waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move". */ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ - VM_BUG_ON(!rcu_read_lock_held()); - return atomic_read(&memcg->moving_account) > 0; -} - static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; @@ -1658,7 +1616,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) * Take this lock when * - a code tries to modify page's memcg while it's USED. * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too. */ static void move_lock_mem_cgroup(struct mem_cgroup *memcg, unsigned long *flags) @@ -1683,15 +1640,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* - * protects memcg_name and makes sure that parallel ooms do not - * interleave - */ + /* oom_info_lock ensures that parallel ooms do not interleave */ static DEFINE_MUTEX(oom_info_lock); - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; - static char memcg_name[PATH_MAX]; - int ret; struct mem_cgroup *iter; unsigned int i; @@ -1701,36 +1651,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) mutex_lock(&oom_info_lock); rcu_read_lock(); - mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); - - ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - /* - * Unfortunately, we are unable to convert to a useful name - * But we'll still print out the usage information - */ - rcu_read_unlock(); - goto done; - } - rcu_read_unlock(); - - pr_info("Task in %s killed", memcg_name); + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_info(" killed as a result of limit of "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_info("\n"); - rcu_read_lock(); - ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - rcu_read_unlock(); - goto done; - } rcu_read_unlock(); - /* - * Continues from above, so we don't need an KERN_ level - */ - pr_cont(" as a result of limit of %s\n", memcg_name); -done: - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, @@ -1745,13 +1673,8 @@ done: res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats"); - - rcu_read_lock(); - ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); - if (!ret) - pr_cont(" for %s", memcg_name); - rcu_read_unlock(); + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -2327,27 +2250,18 @@ cleanup: } /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics. * * Notes: Race condition * - * We usually use page_cgroup_lock() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. - * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration. No race is possible. * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. + * Uncharge happens to pages with zero references, no race possible. * - * Considering "move", this is an only case we see a race. To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath. */ void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2365,9 +2279,10 @@ again: * If this memory cgroup is not under account moving, we don't * need to take move_lock_mem_cgroup(). Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock() if mem_cgroup_stolen() == true. + * rcu_read_unlock(). */ - if (!mem_cgroup_stolen(memcg)) + VM_BUG_ON(!rcu_read_lock_held()); + if (atomic_read(&memcg->moving_account) <= 0) return; move_lock_mem_cgroup(memcg, flags); @@ -2475,7 +2390,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } @@ -2621,55 +2536,65 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See __mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) +static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0; - ret = res_counter_charge(&memcg->res, csize, &fail_res); + if (mem_cgroup_is_root(memcg)) + goto done; +retry: + if (consume_stock(memcg, nr_pages)) + goto done; - if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem; - if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@ -2679,190 +2604,48 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; -} - -/* - * __mem_cgroup_try_charge() does - * 1. detect memcg to be charged against from passed *mm and *ptr, - * 2. update res_counter - * 3. call memory reclaim if necessary. - * - * In some special case, if the task is fatal, fatal_signal_pending() or - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon - * as possible without any hazards. 2: all pages should have a valid - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg - * pointer, that is treated as a charge to root_mem_cgroup. - * - * So __mem_cgroup_try_charge() will return - * 0 ... on success, filling *ptr with a valid memcg pointer. - * -ENOMEM ... charge failure because of resource limits. - * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. - * - * Unlike the exported interface, an "oom" parameter is added. if oom==true, - * the oom-killer can be invoked. - */ -static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - struct mem_cgroup **ptr, - bool oom) -{ - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *memcg = NULL; - int ret; - - /* - * Unlike gloval-vm's OOM-kill, we're not in memory shortage - * in system level. So, allow to go ahead dying process in addition to - * MEMDIE process. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) - || fatal_signal_pending(current))) - goto bypass; + goto retry; - if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry; if (gfp_mask & __GFP_NOFAIL) - oom = false; - - /* - * We always charge the cgroup the mm_struct belongs to. - * The mm_struct's mem_cgroup changes on task migration if the - * thread group leader migrates. It's possible that mm is not - * set, if so charge the root memcg (happens for pagecache usage). - */ - if (!*ptr && !mm) - *ptr = root_mem_cgroup; -again: - if (*ptr) { /* css should be a valid one */ - memcg = *ptr; - if (mem_cgroup_is_root(memcg)) - goto done; - if (consume_stock(memcg, nr_pages)) - goto done; - css_get(&memcg->css); - } else { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(mm->owner); - /* - * Because we don't have task_lock(), "p" can exit. - * In that case, "memcg" can point to root or p can be NULL with - * race with swapoff. Then, we have small risk of mis-accouning. - * But such kind of mis-account by race always happens because - * we don't have cgroup_mutex(). It's overkill and we allo that - * small race, here. - * (*) swapoff at el will charge against mm-struct not against - * task-struct. So, mm->owner can be NULL. - */ - memcg = mem_cgroup_from_task(p); - if (!memcg) - memcg = root_mem_cgroup; - if (mem_cgroup_is_root(memcg)) { - rcu_read_unlock(); - goto done; - } - if (consume_stock(memcg, nr_pages)) { - /* - * It seems dagerous to access memcg without css_get(). - * But considering how consume_stok works, it's not - * necessary. If consume_stock success, some charges - * from this memcg are cached on this cpu. So, we - * don't need to call css_get()/css_tryget() before - * calling consume_stock(). - */ - rcu_read_unlock(); - goto done; - } - /* after here, we may be blocked. we need to get refcnt */ - if (!css_tryget(&memcg->css)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - } - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) { - css_put(&memcg->css); - goto bypass; - } + goto bypass; - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - css_put(&memcg->css); - memcg = NULL; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - css_put(&memcg->css); - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) { - css_put(&memcg->css); - goto nomem; - } - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass; - if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); - css_put(&memcg->css); -done: - *ptr = memcg; - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: - if (!(gfp_mask & __GFP_NOFAIL)) { - *ptr = NULL; + if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; - } bypass: - *ptr = root_mem_cgroup; return -EINTR; + +done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); +done: + return ret; } -/* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). - */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) +static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + if (mem_cgroup_is_root(memcg)) + return; + + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); } /* @@ -2885,9 +2668,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling css_tryget if - * the mem_cgroup is used for charging. (dropping refcnt from swap can be - * called against removed memcg.) + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { @@ -2897,6 +2680,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return mem_cgroup_from_id(id); } +/* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@ -2907,37 +2700,59 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); } - unlock_page_cgroup(pc); return memcg; } -static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) +static void lock_page_lru(struct page *page, int *isolated) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + *isolated = 1; + } else + *isolated = 0; +} + +static void unlock_page_lru(struct page *page, int isolated) +{ + struct zone *zone = page_zone(page); + + if (isolated) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); +} + +static void commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); - struct zone *uninitialized_var(zone); - struct lruvec *lruvec; - bool was_on_lru = false; - bool anon; + int isolated; - lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not @@ -2948,57 +2763,39 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page * may already be on some other mem_cgroup's LRU. Take care of it. */ - if (lrucare) { - zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - was_on_lru = true; - } - } + if (lrucare) + lock_page_lru(page, &isolated); - pc->mem_cgroup = memcg; /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked */ - smp_wmb(); - SetPageCgroupUsed(pc); - - if (lrucare) { - if (was_on_lru) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&zone->lru_lock); - } - - if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; - - mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); + pc->mem_cgroup = memcg; + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); - /* - * "charge_statistics" updated event counter. Then, check it. - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. - * if they exceeds softlimit. - */ - memcg_check_events(memcg, page); + if (lrucare) + unlock_page_lru(page, isolated); } static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. + */ +static DEFINE_MUTEX(memcg_slab_mutex); + static DEFINE_MUTEX(activate_kmem_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -3031,10 +2828,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) print_slabinfo_header(m); - mutex_lock(&memcg->slab_caches_mutex); + mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); return 0; } @@ -3043,31 +2840,27 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; - struct mem_cgroup *_memcg; int ret = 0; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - _memcg = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, oom_gfp_allowed(gfp)); - + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * __mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * __mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@ -3139,8 +2932,6 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } -static void kmem_cache_destroy_work_func(struct work_struct *w); - int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3214,8 +3005,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); + css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3224,30 +3014,44 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, void memcg_free_cache_params(struct kmem_cache *s) { + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + css_put(&s->memcg_params->memcg->css); kfree(s->memcg_params); } -void memcg_register_cache(struct kmem_cache *s) +static void memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *root; - struct mem_cgroup *memcg; + static char memcg_name_buf[NAME_MAX + 1]; /* protected by + memcg_slab_mutex */ + struct kmem_cache *cachep; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); + + id = memcg_cache_id(memcg); /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. */ - lockdep_assert_held(&slab_mutex); - - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); + if (cache_from_memcg_idx(root_cache, id)) + return; - css_get(&memcg->css); + cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); + cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ + if (!cachep) + return; + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3256,51 +3060,30 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); - /* - * Initialize the pointer to this cache in its parent's memcg_params - * before adding it to the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = s; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id]); + root_cache->memcg_params->memcg_caches[id] = cachep; } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_unregister_cache(struct kmem_cache *cachep) { - struct kmem_cache *root; + struct kmem_cache *root_cache; struct mem_cgroup *memcg; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); - /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. - */ - lockdep_assert_held(&slab_mutex); + BUG_ON(is_root_cache(cachep)); - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; + root_cache = cachep->memcg_params->root_cache; + memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg); - mutex_lock(&memcg->slab_caches_mutex); - list_del(&s->memcg_params->list); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; - /* - * Clear the pointer to this cache in its parent's memcg_params only - * after removing it from the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(!root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = NULL; + list_del(&cachep->memcg_params->list); - css_put(&memcg->css); + kmem_cache_destroy(cachep); } /* @@ -3334,195 +3117,74 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *p; - - p = container_of(w, struct memcg_cache_params, destroy); - - cachep = memcg_params_to_cache(p); - - /* - * If we get down to 0 after shrink, we could delete right away. - * However, memcg_release_pages() already puts us back in the workqueue - * in that case. If we proceed deleting, we'll get a dangling - * reference, and removing the object from the workqueue in that case - * is unnecessary complication. We are not a fast path. - * - * Note that this case is fundamentally different from racing with - * shrink_slab(): if memcg_cgroup_destroy_cache() is called in - * kmem_cache_shrink, not only we would be reinserting a dead cache - * into the queue, but doing so from inside the worker racing to - * destroy it. - * - * So if we aren't down to zero, we'll just schedule a worker and try - * again - */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) - kmem_cache_shrink(cachep); - else - kmem_cache_destroy(cachep); -} - -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ - if (!cachep->memcg_params->dead) - return; - - /* - * There are many ways in which we can get here. - * - * We can get to a memory-pressure situation while the delayed work is - * still pending to run. The vmscan shrinkers can then release all - * cache memory and get us to destruction. If this is the case, we'll - * be executed twice, which is a bug (the second time will execute over - * bogus data). In this case, cancelling the work should be fine. - * - * But we can also get here from the worker itself, if - * kmem_cache_shrink is enough to shake all the remaining objects and - * get the page count to 0. In this case, we'll deadlock if we try to - * cancel the work (the worker runs with an internal lock held, which - * is the same lock we would hold for cancel_work_sync().) - * - * Since we can't possibly know who got us here, just refrain from - * running if there is already work pending - */ - if (work_pending(&cachep->memcg_params->destroy)) - return; - /* - * We have to defer the actual destroying to a workqueue, because - * we might currently be in a context that cannot sleep. - */ - schedule_work(&cachep->memcg_params->destroy); -} - -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *s) -{ - struct kmem_cache *new = NULL; - static char *tmp_name = NULL; - static DEFINE_MUTEX(mutex); /* protects tmp_name */ - - BUG_ON(!memcg_can_account_kmem(memcg)); - - mutex_lock(&mutex); - /* - * kmem_cache_create_memcg duplicates the given name and - * cgroup_name for this name requires RCU context. - * This static temporary buffer is used to prevent from - * pointless shortliving allocation. - */ - if (!tmp_name) { - tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); - if (!tmp_name) - goto out; - } - - rcu_read_lock(); - snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); - rcu_read_unlock(); - - new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, - (s->flags & ~SLAB_PANIC), s->ctor, s); - if (new) - new->allocflags |= __GFP_KMEMCG; - else - new = s; -out: - mutex_unlock(&mutex); - return new; -} - -void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s) { struct kmem_cache *c; - int i; + int i, failed = 0; - if (!s->memcg_params) - return; - if (!s->memcg_params->is_root_cache) - return; - - /* - * If the cache is being destroyed, we trust that there is no one else - * requesting objects from it. Even if there are, the sanity checks in - * kmem_cache_destroy should caught this ill-case. - * - * Still, we don't want anyone else freeing memcg_caches under our - * noses, which can happen if a new memcg comes to life. As usual, - * we'll take the activate_kmem_mutex to protect ourselves against - * this. - */ - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_slab_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) continue; - /* - * We will now manually delete the caches, so to avoid races - * we need to cancel all pending destruction workers and - * proceed with destruction ourselves. - * - * kmem_cache_destroy() will call kmem_cache_shrink internally, - * and that could spawn the workers again: it is likely that - * the cache still have active pages until this very moment. - * This would lead us back to mem_cgroup_destroy_cache. - * - * But that will not execute at all if the "dead" flag is not - * set, so flip it down to guarantee we are in control. - */ - c->memcg_params->dead = false; - cancel_work_sync(&c->memcg_params->destroy); - kmem_cache_destroy(c); + memcg_unregister_cache(c); + + if (cache_from_memcg_idx(s, i)) + failed++; } - mutex_unlock(&activate_kmem_mutex); + mutex_unlock(&memcg_slab_mutex); + return failed; } -struct create_work { - struct mem_cgroup *memcg; - struct kmem_cache *cachep; - struct work_struct work; -}; - -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; - struct memcg_cache_params *params; + struct memcg_cache_params *params, *tmp; if (!memcg_kmem_is_active(memcg)) return; - mutex_lock(&memcg->slab_caches_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + mutex_lock(&memcg_slab_mutex); + list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - cachep->memcg_params->dead = true; - schedule_work(&cachep->memcg_params->destroy); + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + memcg_unregister_cache(cachep); } - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); } -static void memcg_create_cache_work_func(struct work_struct *w) +struct memcg_register_cache_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void memcg_register_cache_func(struct work_struct *w) { - struct create_work *cw; + struct memcg_register_cache_work *cw = + container_of(w, struct memcg_register_cache_work, work); + struct mem_cgroup *memcg = cw->memcg; + struct kmem_cache *cachep = cw->cachep; - cw = container_of(w, struct create_work, work); - memcg_create_kmem_cache(cw->memcg, cw->cachep); - css_put(&cw->memcg->css); + mutex_lock(&memcg_slab_mutex); + memcg_register_cache(memcg, cachep); + mutex_unlock(&memcg_slab_mutex); + + css_put(&memcg->css); kfree(cw); } /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct create_work *cw; + struct memcg_register_cache_work *cw; - cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (cw == NULL) { css_put(&memcg->css); return; @@ -3531,17 +3193,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_create_cache_work_func); + INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_create_cache_enqueue will recurse. + * in __memcg_schedule_register_cache will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -3550,9 +3212,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ memcg_stop_kmem_account(); - __memcg_create_cache_enqueue(memcg, cachep); + __memcg_schedule_register_cache(memcg, cachep); memcg_resume_kmem_account(); } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ + int res; + + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, + PAGE_SIZE << order); + if (!res) + atomic_add(1 << order, &cachep->memcg_params->nr_pages); + return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ + memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); + atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} + /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3591,7 +3271,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, } /* The corresponding put will be done in the workqueue. */ - if (!css_tryget(&memcg->css)) + if (!css_tryget_online(&memcg->css)) goto out; rcu_read_unlock(); @@ -3603,22 +3283,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * * However, there are some clashes that can arrive from locking. * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. So it's better to + * defer everything. */ - memcg_create_cache_enqueue(memcg, cachep); + memcg_schedule_register_cache(memcg, cachep); return cachep; out: rcu_read_unlock(); return cachep; } -EXPORT_SYMBOL(__memcg_kmem_get_cache); /* * We need to verify if the allocation against current->mm->owner's memcg is @@ -3645,11 +3319,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from @@ -3669,15 +3344,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) if (!current->mm || current->memcg_kmem_skip_account) return true; - memcg = try_get_mem_cgroup_from_mm(current->mm); - - /* - * very rare case described in mem_cgroup_from_task. Unfortunately there - * isn't much we can do without complicating this too much, and it would - * be gfp-dependent anyway. Just let it go - */ - if (unlikely(!memcg)) - return true; + memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_can_account_kmem(memcg)) { css_put(&memcg->css); @@ -3704,12 +3371,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3719,19 +3387,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) pc = lookup_page_cgroup(page); - /* - * Fast unlocked return. Theoretically might have changed, have to - * check again after locking. - */ if (!PageCgroupUsed(pc)) return; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0; /* * We trust that only if there is a memcg associated with the page, it @@ -3744,14 +3404,13 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@ -3772,27 +3431,13 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline -void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, - struct mem_cgroup *to, - unsigned int nr_pages, - enum mem_cgroup_stat_index idx) -{ - /* Update stat data for mem_cgroup */ - preempt_disable(); - __this_cpu_sub(from->stat->count[idx], nr_pages); - __this_cpu_add(to->stat->count[idx], nr_pages); - preempt_enable(); -} - /** * mem_cgroup_move_account - move account of the page * @page: the page @@ -3816,7 +3461,6 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; - bool anon = PageAnon(page); VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -3830,36 +3474,53 @@ static int mem_cgroup_move_account(struct page *page, if (nr_pages > 1 && !PageTransHuge(page)) goto out; - lock_page_cgroup(pc); + /* + * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out; ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out_unlock; move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_FILE_MAPPED); + if (!PageAnon(page) && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } - if (PageWriteback(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_WRITEBACK); + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } - mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; -unlock: - unlock_page_cgroup(pc); - /* - * check events - */ + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); + local_irq_enable(); +out_unlock: + unlock_page(page); out: return ret; } @@ -3930,463 +3591,12 @@ out: return ret; } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit - */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - bool oom = true; - int ret; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; - } - - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret == -ENOMEM) - return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); - return 0; -} - -int mem_cgroup_newpage_charge(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - if (mem_cgroup_disabled()) - return 0; - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - -/* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ -static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg; - struct page_cgroup *pc; - int ret; - - pc = lookup_page_cgroup(page); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. - */ - if (PageCgroupUsed(pc)) - return 0; - if (!do_swap_account) - goto charge_cur_mm; - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - goto charge_cur_mm; - *memcgp = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); - css_put(&memcg->css); - if (ret == -EINTR) - ret = 0; - return ret; -charge_cur_mm: - ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; -} - -int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) -{ - *memcgp = NULL; - if (mem_cgroup_disabled()) - return 0; - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - int ret; - - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); -} - -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); -} - -static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. - */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } -} - -void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) -{ - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) -{ - struct mem_cgroup *memcg = NULL; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (!PageSwapCache(page)) - ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - else { /* page is swapcache/shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, type); - } - return ret; -} - -static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) -{ - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; -direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); -} - -/* - * uncharge if !page_mapped(page) - */ -static struct mem_cgroup * -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration && !mem_cgroup_is_root(memcg)) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - -unlock_out: - unlock_page_cgroup(pc); - return NULL; -} - -void mem_cgroup_uncharge_page(struct page *page) -{ - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); -} - -void mem_cgroup_uncharge_cache_page(struct page *page) -{ - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); -} - -/* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - -void mem_cgroup_uncharge_start(void) -{ - current->memcg_batch.do_batch++; - /* We can do nest. */ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } -} - -void mem_cgroup_uncharge_end(void) -{ - struct memcg_batch_info *batch = ¤t->memcg_batch; - - if (!batch->do_batch) - return; - - batch->do_batch--; - if (batch->do_batch) /* If stacked, do nothing. */ - return; - - if (!batch->memcg) - return; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. - */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); - /* forget this pointer (for sanity check) */ - batch->memcg = NULL; -} - -#ifdef CONFIG_SWAP -/* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ -void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); -} -#endif - #ifdef CONFIG_MEMCG_SWAP -/* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ -void mem_cgroup_uncharge_swap(swp_entry_t ent) +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. - * This memcg can be obsolete one. We avoid calling css_tryget - */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); } /** @@ -4438,175 +3648,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -/* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ -void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - enum charge_type ctype; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. - * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); -} - -/* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) -{ - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. - */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); -} - -/* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ -void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) -{ - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -4805,7 +3846,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); /* * If we failed to reclaim anything from this memory cgroup @@ -4833,7 +3874,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (1); } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); excess = res_counter_soft_limit_excess(&mz->memcg->res); /* * One school of thought says that we should not add @@ -4844,8 +3885,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * term TODO. */ /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); - spin_unlock(&mctz->lock); + __mem_cgroup_insert_exceeded(mz, mctz, excess); + spin_unlock_irq(&mctz->lock); css_put(&mz->memcg->css); loop++; /* @@ -4911,9 +3952,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (mem_cgroup_move_parent(page, pc, memcg)) { /* found lock contention or "pc" is obsolete. */ busy = page; - cond_resched(); } else busy = NULL; + cond_resched(); } while (!list_empty(list)); } @@ -4964,18 +4005,28 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) } while (usage > 0); } +/* + * Test whether @memcg has children, dead or alive. Note that this + * function doesn't care whether @memcg has use_hierarchy enabled and + * returns %true if there are child csses according to the cgroup + * hierarchy. Testing use_hierarchy is the caller's responsiblity. + */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { - lockdep_assert_held(&memcg_create_mutex); + bool ret; + /* - * The lock does not prevent addition or deletion to the list - * of children, but it prevents a new child from being - * initialized based on this parent in css_online(), so it's - * enough to decide whether hierarchically inherited - * attributes can still be changed or not. + * The lock does not prevent addition or deletion of children, but + * it prevents a new child from being initialized based on this + * parent in css_online(), so it's enough to decide whether + * hierarchically inherited attributes can still be changed or not. */ - return memcg->use_hierarchy && - !list_empty(&memcg->css.cgroup->children); + lockdep_assert_held(&memcg_create_mutex); + + rcu_read_lock(); + ret = css_next_child(NULL, &memcg->css); + rcu_read_unlock(); + return ret; } /* @@ -4987,11 +4038,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) static int mem_cgroup_force_empty(struct mem_cgroup *memcg) { int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct cgroup *cgrp = memcg->css.cgroup; - - /* returns EBUSY if there is a task or if we come here twice. */ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) - return -EBUSY; /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); @@ -5011,20 +4057,19 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) } } - lru_add_drain(); - mem_cgroup_reparent_charges(memcg); return 0; } -static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); if (mem_cgroup_is_root(memcg)) return -EINVAL; - return mem_cgroup_force_empty(memcg); + return mem_cgroup_force_empty(memcg) ?: nbytes; } static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, @@ -5038,7 +4083,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); mutex_lock(&memcg_create_mutex); @@ -5055,7 +4100,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (list_empty(&memcg->css.cgroup->children)) + if (!memcg_has_children(memcg)) memcg->use_hierarchy = val; else retval = -EBUSY; @@ -5068,7 +4113,6 @@ out: return retval; } - static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { @@ -5108,38 +4152,29 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } + static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return mem_cgroup_usage(memcg, false); + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return mem_cgroup_usage(memcg, true); + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; } #ifdef CONFIG_MEMCG_KMEM @@ -5172,7 +4207,8 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. */ mutex_lock(&memcg_create_mutex); - if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) + if (cgroup_has_tasks(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) err = -EBUSY; mutex_unlock(&memcg_create_mutex); if (err) @@ -5189,13 +4225,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * Make sure we have enough space for this cgroup in each root cache's * memcg_params. */ + mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(memcg_id + 1); + mutex_unlock(&memcg_slab_mutex); if (err) goto out_rmid; memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); /* * We couldn't have accounted to this cgroup, because it hasn't got the @@ -5273,17 +4310,18 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); enum res_type type; int name; unsigned long long val; int ret; - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -5292,7 +4330,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; if (type == _MEM) @@ -5305,7 +4343,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; break; case RES_SOFT_LIMIT: - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; /* @@ -5322,7 +4360,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; /* should be BUG() ? */ break; } - return ret; + return ret ?: nbytes; } static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, @@ -5335,8 +4373,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, if (!memcg->use_hierarchy) goto out; - while (css_parent(&memcg->css)) { - memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + while (memcg->css.parent) { + memcg = mem_cgroup_from_css(memcg->css.parent); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5349,14 +4387,15 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); int name; enum res_type type; - type = MEMFILE_TYPE(event); - name = MEMFILE_ATTR(event); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -5381,7 +4420,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, @@ -5540,7 +4579,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; rstat = &mz->lruvec.reclaim_stat; recent_rotated[0] += rstat->recent_rotated[0]; @@ -5570,22 +4609,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - - if (val > 100 || !parent) - return -EINVAL; - - mutex_lock(&memcg_create_mutex); - /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); + if (val > 100) return -EINVAL; - } - memcg->swappiness = val; - - mutex_unlock(&memcg_create_mutex); + if (css->parent) + memcg->swappiness = val; + else + vm_swappiness = val; return 0; } @@ -5670,8 +4701,12 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) { struct mem_cgroup_eventfd_list *ev; + spin_lock(&memcg_oom_lock); + list_for_each_entry(ev, &memcg->oom_notify, list) eventfd_signal(ev->eventfd, 1); + + spin_unlock(&memcg_oom_lock); return 0; } @@ -5697,15 +4732,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = mem_cgroup_usage(memcg, true); + } else BUG(); - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5785,18 +4820,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, int i, j, size; mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = mem_cgroup_usage(memcg, true); + } else BUG(); if (!thresholds->primary) goto unlock; - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5917,22 +4953,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!parent || !((val == 0) || (val == 1))) + if (!css->parent || !((val == 0) || (val == 1))) return -EINVAL; - mutex_lock(&memcg_create_mutex); - /* oom-kill-disable is a flag for subhierarchy. */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - mutex_unlock(&memcg_create_mutex); + return 0; } @@ -5972,10 +5001,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) * which is then paired with css_put during uncharge resp. here. * * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 - * and shouldn't be incremented anymore (css_tryget would fail) - * we do not have other options because of the kmem allocations - * lifetime. + * css_offline() when the referencemight have dropped down to 0 and + * shouldn't be incremented anymore (css_tryget_online() would + * fail) we do not have other options because of the kmem + * allocations lifetime. */ css_get(&memcg->css); @@ -6094,9 +5123,10 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation. */ -static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { + struct cgroup_subsys_state *css = of_css(of); struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; @@ -6107,15 +5137,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, char *endp; int ret; - efd = simple_strtoul(buffer, &endp, 10); + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); if (*endp != ' ') return -EINVAL; - buffer = endp + 1; + buf = endp + 1; - cfd = simple_strtoul(buffer, &endp, 10); + cfd = simple_strtoul(buf, &endp, 10); if ((*endp != ' ') && (*endp != '\0')) return -EINVAL; - buffer = endp + 1; + buf = endp + 1; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) @@ -6183,19 +5215,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - rcu_read_lock(); - + cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, - &mem_cgroup_subsys); - if (cfile_css == css && css_tryget(css)) - ret = 0; - - rcu_read_unlock(); - if (ret) + if (IS_ERR(cfile_css)) + goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); goto out_put_cfile; + } - ret = event->register_event(memcg, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buf); if (ret) goto out_put_css; @@ -6208,7 +5238,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, fdput(cfile); fdput(efile); - return 0; + return nbytes; out_put_css: css_put(css); @@ -6233,25 +5263,25 @@ static struct cftype mem_cgroup_files[] = { { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6260,17 +5290,16 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "force_empty", - .trigger = mem_cgroup_force_empty_write, + .write = mem_cgroup_force_empty_write, }, { .name = "use_hierarchy", - .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, { .name = "cgroup.event_control", /* XXX: for compat */ - .write_string = memcg_write_event_control, + .write = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6303,7 +5332,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { @@ -6314,13 +5343,13 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO @@ -6343,19 +5372,19 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ @@ -6533,9 +5562,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); + struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); + int ret; - if (css->cgroup->id > MEM_CGROUP_ID_MAX) + if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; if (!parent) @@ -6566,11 +5596,22 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * unfortunate state in our controller. */ if (parent != root_mem_cgroup) - mem_cgroup_subsys.broken_hierarchy = true; + memory_cgrp_subsys.broken_hierarchy = true; } mutex_unlock(&memcg_create_mutex); - return memcg_init_kmem(memcg, &mem_cgroup_subsys); + ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); + if (ret) + return ret; + + /* + * Make sure the memcg is initialized: mem_cgroup_iter() + * orders reading memcg->initialized against its callers + * reading the memcg members. + */ + smp_store_release(&memcg->initialized, 1); + + return 0; } /* @@ -6620,7 +5661,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) css_for_each_descendant_post(iter, css) mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - mem_cgroup_destroy_all_caches(memcg); + memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -6630,7 +5671,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) /* * XXX: css_offline() would be where we should reparent all * memory to prepare the cgroup for destruction. However, - * memcg does not do css_tryget() and res_counter charging + * memcg does not do css_tryget_online() and res_counter charging * under the same RCU lock region, which means that charging * could race with offlining. Offlining only happens to * cgroups with no tasks in them but charges can show up @@ -6644,9 +5685,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) * lookup_swap_cgroup_id() * rcu_read_lock() * mem_cgroup_lookup() - * css_tryget() + * css_tryget_online() * rcu_read_unlock() - * disable css_tryget() + * disable css_tryget_online() * call_rcu() * offline_css() * reparent_charges() @@ -6668,58 +5709,63 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) __mem_cgroup_free(memcg); } +/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css. This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency. The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_resize_limit(memcg, ULLONG_MAX); + mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); + memcg_update_kmem_limit(memcg, ULLONG_MAX); + res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret; - if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + cancel_charge(root_mem_cgroup, count); return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = __mem_cgroup_try_charge(NULL, - GFP_KERNEL, 1, &memcg, false); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** @@ -6817,16 +5863,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - page = find_get_page(mapping, pgoff); - #ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - if (do_swap_account) - *entry = swap; - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif return page; } @@ -6851,9 +5901,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the pc is valid or + * not under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@ -6978,7 +6028,7 @@ static void __mem_cgroup_clear_mc(void) /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@ -6986,7 +6036,7 @@ static void __mem_cgroup_clear_mc(void) * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ @@ -6994,19 +6044,18 @@ static void __mem_cgroup_clear_mc(void) /* uncharge swap account from the old cgroup */ if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); - if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + if (!mem_cgroup_is_root(mc.to)) res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@ -7259,31 +6308,31 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, /* * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify sane_behavior flag on each mount attempt. + * to verify whether we're attached to the default hierarchy on each mount + * attempt. */ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* - * use_hierarchy is forced with sane_behavior. cgroup core + * use_hierarchy is forced on the default hierarchy. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root_css->cgroup)) + if (cgroup_on_dfl(root_css->cgroup)) mem_cgroup_from_css(root_css)->use_hierarchy = true; } -struct cgroup_subsys mem_cgroup_subsys = { - .name = "memory", - .subsys_id = mem_cgroup_subsys_id, +struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .base_cftypes = mem_cgroup_files, + .legacy_cftypes = mem_cgroup_files, .early_init = 0, }; @@ -7300,7 +6349,8 @@ __setup("swapaccount=", enable_swap_account); static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memsw_cgroup_files)); } static void __init enable_swap_cgroup(void) @@ -7317,6 +6367,402 @@ static void __init enable_swap_cgroup(void) } #endif +#ifdef CONFIG_MEMCG_SWAP +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ +void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); +} + +/** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ +void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); +} +#endif + +/** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ +int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) +{ + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. + */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } +out: + *memcgp = memcg; + return ret; +} + +/** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ +void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) +{ + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + commit_charge(page, memcg, lrucare); + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); + memcg_check_events(memcg, page); + local_irq_enable(); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } +} + +/** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +{ + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); +} + +static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) +{ + unsigned long flags; + + if (!mem_cgroup_is_root(memcg)) { + if (nr_mem) + res_counter_uncharge(&memcg->res, + nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, + nr_memsw * PAGE_SIZE); + memcg_oom_recover(memcg); + } + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); +} + +static void uncharge_list(struct list_head *page_list) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); +} + +/** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge(struct page *page) +{ + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + /* Don't touch page->lru of any random page, pre-check: */ + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + return; + + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); +} + +/** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + + if (!list_empty(page_list)) + uncharge_list(page_list); +} + +/** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: both pages might be on the LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) +{ + struct page_cgroup *pc; + int isolated; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), + newpage); + + if (mem_cgroup_disabled()) + return; + + /* Page cache replacement: new page already charged? */ + pc = lookup_page_cgroup(newpage); + if (PageCgroupUsed(pc)) + return; + + /* Re-entrant migration: old page already uncharged? */ + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + + if (lrucare) + lock_page_lru(oldpage, &isolated); + + pc->flags = 0; + + if (lrucare) + unlock_page_lru(oldpage, isolated); + + commit_charge(newpage, pc->mem_cgroup, lrucare); +} + /* * subsys_initcall() for memory controller. * diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 90002ea4363..44c6bd201d3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p) return -EINVAL; css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ - if (!css->cgroup->dentry) - return -EINVAL; - - ino = css->cgroup->dentry->d_inode->i_ino; + ino = cgroup_ino(css->cgroup); css_put(css); - if (ino != hwpoison_filter_memcg) + if (!ino || ino != hwpoison_filter_memcg) return -EINVAL; return 0; @@ -208,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #endif si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; - if ((flags & MF_ACTION_REQUIRED) && t == current) { + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { si.si_code = BUS_MCEERR_AR; - ret = force_sig_info(SIGBUS, &si, t); + ret = force_sig_info(SIGBUS, &si, current); } else { /* * Don't use force here, it's convenient if the signal @@ -384,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) { + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) +{ + struct task_struct *t; if (!tsk->mm) - return 0; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return NULL; + if (force_early) + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -408,20 +435,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff = page_to_pgoff(page); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -432,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -441,11 +469,11 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, mutex_lock(&mapping->i_mmap_mutex); read_lock(&tasklist_lock); for_each_process(tsk) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* @@ -455,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. */ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -469,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * First preallocate one tokill structure outside the spin locks, * so that we can kill at least one process reasonably reliable. */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) { struct to_kill *tk; @@ -480,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) if (!tk) return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); + collect_procs_anon(page, tokill, &tk, force_early); else - collect_procs_file(page, tokill, &tk); + collect_procs_file(page, tokill, &tk, force_early); kfree(tk); } @@ -866,8 +895,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct page *hpage = *hpagep; struct page *ppage; + /* + * Here we are interested only in user-mapped pages, so skip any + * other types of pages. + */ if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; + if (!(PageLRU(hpage) || PageHuge(p))) + return SWAP_SUCCESS; /* * This check implies we don't kill processes if their pages @@ -876,8 +911,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!page_mapped(hpage)) return SWAP_SUCCESS; - if (PageKsm(p)) + if (PageKsm(p)) { + pr_err("MCE %#lx: can't handle KSM pages.\n", pfn); return SWAP_FAIL; + } if (PageSwapCache(p)) { printk(KERN_ERR @@ -967,7 +1004,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill); + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) @@ -1085,15 +1122,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } else if (PageHuge(hpage)) { /* - * Check "just unpoisoned", "filter hit", and - * "race with other subpage." + * Check "filter hit" and "race with other subpage." */ lock_page(hpage); - if (!PageHWPoison(hpage) - || (hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &num_poisoned_pages); - return 0; + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + return 0; + } } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); @@ -1129,18 +1167,20 @@ int memory_failure(unsigned long pfn, int trapno, int flags) action_result(pfn, "free buddy, 2nd try", DELAYED); return 0; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } } + lock_page(hpage); + /* - * Lock the page and wait for writeback to finish. - * It's very difficult to mess with pages currently under IO - * and in many cases impossible, so we just avoid it here. + * The page could have changed compound pages during the locking. + * If this happens just bail out. */ - lock_page(hpage); + if (compound_head(p) != hpage) { + action_result(pfn, "different compound page after locking", IGNORED); + res = -EBUSY; + goto out; + } /* * We use page flags to determine what action should be taken, but @@ -1156,6 +1196,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + put_page(hpage); res = 0; goto out; } @@ -1167,6 +1209,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } + if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + goto identify_page_state; + /* * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned @@ -1187,6 +1232,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHuge(p)) set_page_hwpoison_huge_page(hpage); + /* + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ wait_on_page_writeback(p); /* @@ -1198,7 +1247,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS) { - printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + action_result(pfn, "unmapping failed", IGNORED); res = -EBUSY; goto out; } @@ -1212,6 +1261,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) goto out; } +identify_page_state: res = -EBUSY; /* * The first check uses the current page flags which may not have any @@ -1299,7 +1349,7 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = &__get_cpu_var(memory_failure_cpu); + mf_cpu = this_cpu_ptr(&memory_failure_cpu); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); @@ -1504,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_move(&hpage->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", @@ -1585,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { if (!list_empty(&pagelist)) { @@ -1665,11 +1715,7 @@ int soft_offline_page(struct page *page, int flags) } } - /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. - */ - lock_memory_hotplug(); + get_online_mems(); /* * Isolate the page, so that it doesn't get reallocated if it @@ -1680,7 +1726,7 @@ int soft_offline_page(struct page *page, int flags) set_migratetype_isolate(page, true); ret = get_any_page(page, pfn, flags); - unlock_memory_hotplug(); + put_online_mems(); if (ret > 0) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); diff --git a/mm/memory.c b/mm/memory.c index 22dfa617bdd..e229970e422 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,6 +60,7 @@ #include <linux/migrate.h> #include <linux/string.h> #include <linux/dma-debug.h> +#include <linux/debugfs.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -117,6 +118,8 @@ __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; unsigned long highest_memmap_pfn __read_mostly; +EXPORT_SYMBOL(zero_pfn); + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -231,17 +234,18 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long #endif } -void tlb_flush_mmu(struct mmu_gather *tlb) +static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { - struct mmu_gather_batch *batch; - - if (!tlb->need_flush) - return; tlb->need_flush = 0; tlb_flush(tlb); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif +} + +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; for (batch = &tlb->local; batch; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); @@ -250,6 +254,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb->active = &tlb->local; } +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + if (!tlb->need_flush) + return; + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + /* tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources * that were required. @@ -688,11 +700,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -878,7 +885,7 @@ out_set_pte: return 0; } -int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { @@ -1120,14 +1127,16 @@ again: addr) != page->index) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(ptent)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, addr, pte, ptfile); } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { - if (pte_dirty(ptent)) + if (pte_dirty(ptent)) { + force_flush = 1; set_page_dirty(page); + } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); @@ -1136,9 +1145,10 @@ again: page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (unlikely(!__tlb_remove_page(tlb, page))) { + force_flush = 1; break; + } continue; } /* @@ -1173,18 +1183,11 @@ again: add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - /* - * mmu_gather ran out of room to batch pages, we break out of - * the PTE lock to avoid doing the potential expensive TLB invalidate - * and page-free while holding it. - */ + /* Do the actual TLB flush before dropping ptl */ if (force_flush) { unsigned long old_end; - force_flush = 0; - /* * Flush the TLB just for the previous segment, * then update the range to be the remaining @@ -1192,11 +1195,21 @@ again: */ old_end = tlb->end; tlb->end = addr; - - tlb_flush_mmu(tlb); - + tlb_flush_mmu_tlbonly(tlb); tlb->start = addr; tlb->end = old_end; + } + pte_unmap_unlock(start_pte, ptl); + + /* + * If we forced a TLB flush (either due to running out of + * batch buffers or because we needed to flush dirty TLB + * entries before releasing the ptl), free the batched + * memory too. Restart if we didn't do everything. + */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu_free(tlb); if (addr != end) goto again; @@ -1280,7 +1293,6 @@ static void unmap_page_range(struct mmu_gather *tlb, details = NULL; BUG_ON(addr >= end); - mem_cgroup_uncharge_start(); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { @@ -1290,7 +1302,6 @@ static void unmap_page_range(struct mmu_gather *tlb, next = zap_pud_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); - mem_cgroup_uncharge_end(); } @@ -1320,9 +1331,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When + * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file + * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. @@ -1441,617 +1452,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in <linux/mm.h> - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). - */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - goto out; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else { - page = NULL; - goto out; - } - } - goto out; - } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - goto split_fallthrough; - } - ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - goto out; - } - } else - spin_unlock(ptl); - /* fall through */ - } -split_fallthrough: - if (unlikely(pmd_bad(*pmd))) - goto no_page_table; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte) || pte_file(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto split_fallthrough; - } - if ((flags & FOLL_NUMA) && pte_numa(pte)) - goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); - } - - if (flags & FOLL_GET) - get_page_foll(page); - if (flags & FOLL_TOUCH) { - if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - /* - * pte_mkyoung() would be more correct here, but atomic care - * is needed to avoid losing the dirty bit: it is easier to use - * mark_page_accessed(). - */ - mark_page_accessed(page); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. - * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return page; - -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - -no_page: - pte_unmap_unlock(ptep, ptl); - if (!pte_none(pte)) - return page; - -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or - * page tables. Return error instead of NULL to skip handle_mm_fault, - * then get_dump_page() will return NULL to leave a hole in the dump. - * But we can only make this optimization where a hole would surely - * be zero-filled if handle_mm_fault() actually did handle it. - */ - if ((flags & FOLL_DUMP) && - (!vma->vm_ops || !vma->vm_ops->fault)) - return ERR_PTR(-EFAULT); - return page; -} - -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - -/** - * __get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * __get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * __get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If - * the page is written to, set_page_dirty (or set_page_dirty_lock, as - * appropriate) must be called after the page is finished with, and - * before put_page is called. - * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. - * - * In most cases, get_user_pages or get_user_pages_fast should be used - * instead of __get_user_pages. __get_user_pages should be used only if - * you need some special @gup_flags. - */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - long i; - unsigned long vm_flags; - unsigned int page_mask; - - if (!nr_pages) - return 0; - - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - - /* - * Require read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ - vm_flags = (gup_flags & FOLL_WRITE) ? - (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= (gup_flags & FOLL_FORCE) ? - (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - - /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - - i = 0; - - do { - struct vm_area_struct *vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(mm, start)) { - unsigned long pg = start & PAGE_MASK; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (gup_flags & FOLL_WRITE) - return i ? : -EFAULT; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if (pmd_none(*pmd)) - return i ? : -EFAULT; - VM_BUG_ON(pmd_trans_huge(*pmd)); - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - return i ? : -EFAULT; - } - vma = get_gate_vma(mm); - if (pages) { - struct page *page; - - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && - is_zero_pfn(pte_pfn(*pte))) - page = pte_page(*pte); - else { - pte_unmap(pte); - return i ? : -EFAULT; - } - } - pages[i] = page; - get_page(page); - } - pte_unmap(pte); - page_mask = 0; - goto next_page; - } - - if (!vma || - (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - !(vm_flags & vma->vm_flags)) - return i ? : -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, gup_flags); - continue; - } - - do { - struct page *page; - unsigned int foll_flags = gup_flags; - unsigned int page_increm; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory. - */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; - - cond_resched(); - while (!(page = follow_page_mask(vma, start, - foll_flags, &page_mask))) { - int ret; - unsigned int fault_flags = 0; - - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } - if (foll_flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; - if (foll_flags & FOLL_NOWAIT) - fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); - - ret = handle_mm_fault(mm, vma, start, - fault_flags); - - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? i : -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | - VM_FAULT_HWPOISON_LARGE)) { - if (i) - return i; - else if (gup_flags & FOLL_HWPOISON) - return -EHWPOISON; - else - return -EFAULT; - } - if (ret & VM_FAULT_SIGBUS) - return i ? i : -EFAULT; - BUG(); - } - - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - - if (ret & VM_FAULT_RETRY) { - if (nonblocking) - *nonblocking = 0; - return i; - } - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - - cond_resched(); - } - if (IS_ERR(page)) - return i ? i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - page_mask = 0; - } -next_page: - if (vmas) { - vmas[i] = vma; - page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); - if (page_increm > nr_pages) - page_increm = nr_pages; - i += page_increm; - start += page_increm * PAGE_SIZE; - nr_pages -= page_increm; - } while (nr_pages && start < vma->vm_end); - } while (nr_pages); - return i; -} -EXPORT_SYMBOL(__get_user_pages); - -/* - * fixup_user_fault() - manually resolve a user page fault - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @address: user address - * @fault_flags:flags to pass down to handle_mm_fault() - * - * This is meant to be called in the specific scenario where for locking reasons - * we try to access user memory in atomic context (within a pagefault_disable() - * section), this returns -EFAULT, and we want to resolve the user fault before - * trying again. - * - * Typically this is meant to be used by the futex code. - * - * The main difference with get_user_pages() is that this function will - * unconditionally call handle_mm_fault() which will in turn perform all the - * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. - * - * This is important for some architectures where those bits also gate the - * access permission to the page because they are maintained in software. On - * such architectures, gup() will not be enough to make a subsequent access - * succeed. - * - * This should be called with the mm_sem held for read. - */ -int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) -{ - struct vm_area_struct *vma; - int ret; - - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) - return -EFAULT; - - ret = handle_mm_fault(mm, vma, address, fault_flags); - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) - return -EFAULT; - BUG(); - } - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - return 0; -} - -/* - * get_user_pages() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force write access even if user mapping is - * readonly. This will result in the page being COWed even - * in MAP_SHARED mappings. You do not want this. - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. - * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. - * - * See also get_user_pages_fast, for performance critical applications. - */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) -{ - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); -} -EXPORT_SYMBOL(get_user_pages); - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ - pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { @@ -2587,6 +1987,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo } /* + * Notify the address space that the page is about to become writable so that + * it can prohibit this or wait for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can sleep if it needs to. + */ +static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = page; + + ret = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + if (unlikely(!(ret & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + unlock_page(page); + return 0; /* retry */ + } + ret |= VM_FAULT_LOCKED; + } else + VM_BUG_ON_PAGE(!PageLocked(page), page); + return ret; +} + +/* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. @@ -2616,6 +2048,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *dirty_page = NULL; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ + struct mem_cgroup *memcg; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2668,42 +2101,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - struct vm_fault vmf; int tmp; - - vmf.virtual_address = (void __user *)(address & - PAGE_MASK); - vmf.pgoff = old_page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.page = old_page; - - /* - * Notify the address space that the page is about to - * become writable so that it can prohibit this or wait - * for the page to get into an appropriate state. - * - * We do this without the lock held, so that it can - * sleep if it needs to. - */ page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(old_page); - if (!old_page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(old_page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(old_page), old_page); - /* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If @@ -2748,11 +2154,11 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * __do_fault is protected similarly. + * do_shared_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + set_page_dirty_balance(dirty_page); /* file_update_time outside page_lock */ if (vma->vm_file) file_update_time(vma->vm_file); @@ -2798,7 +2204,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -2828,6 +2234,8 @@ gotten: */ ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -2865,7 +2273,7 @@ gotten: new_page = old_page; ret |= VM_FAULT_WRITE; } else - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); if (new_page) page_cache_release(new_page); @@ -2892,10 +2300,6 @@ oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; - -unwritable_page: - page_cache_release(old_page); - return ret; } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2997,7 +2401,10 @@ EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_sem locked or unlocked in the same cases + * as does filemap_fault(). */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -3005,10 +2412,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, { spinlock_t *ptl; struct page *page, *swapcache; + struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; @@ -3084,7 +2491,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3109,10 +2516,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * while the page is counted on swap but not yet in mapcount i.e. * before page_add_anon_rmap() and swap_free(); try_to_free_swap() * must be called after the swap_free(), or it will never succeed. - * Because delete_from_swap_page() may be called by reuse_swap_page(), - * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry - * in page->private. In this case, a record in swap_cgroup is silently - * discarded at swap_free(). */ inc_mm_counter_fast(mm, MM_ANONPAGES); @@ -3128,12 +2531,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); - if (page == swapcache) + if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, address); - /* It's better to call commit-charge after rmap is established */ - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) @@ -3166,7 +2571,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge(page, memcg); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -3222,6 +2627,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags) { + struct mem_cgroup *memcg; struct page *page; spinlock_t *ptl; pte_t entry; @@ -3255,7 +2661,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -3268,6 +2674,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -3277,7 +2685,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); goto unlock; oom_free_page: @@ -3287,52 +2695,15 @@ oom: } /* - * __do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * The mmap_sem must have been held on entry, and may have been + * released depending on flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int __do_fault(struct vm_area_struct *vma, unsigned long address, + pgoff_t pgoff, unsigned int flags, struct page **page) { - pte_t *page_table; - spinlock_t *ptl; - struct page *page; - struct page *cow_page; - pte_t entry; - int anon = 0; - struct page *dirty_page = NULL; struct vm_fault vmf; int ret; - int page_mkwrite = 0; - - /* - * If we do COW later, allocate page befor taking lock_page() - * on the file cache page. This will reduce lock holding time. - */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - - cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!cow_page) - return VM_FAULT_OOM; - - if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { - page_cache_release(cow_page); - return VM_FAULT_OOM; - } - } else - cow_page = NULL; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; @@ -3340,154 +2711,325 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | - VM_FAULT_RETRY))) - goto uncharge_out; + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - ret = VM_FAULT_HWPOISON; page_cache_release(vmf.page); - goto uncharge_out; + return VM_FAULT_HWPOISON; } - /* - * For consistency in subsequent calls, make the faulted page always - * locked. - */ if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf.page); else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + *page = vmf.page; + return ret; +} + +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. + * + * Target users are page handler itself and implementations of + * vm_ops->map_pages. + */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon) +{ + pte_t entry; + + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) + entry = pte_mksoft_dirty(entry); + if (anon) { + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + page_add_file_rmap(page); + } + set_pte_at(vma->vm_mm, address, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); +} + +static unsigned long fault_around_bytes __read_mostly = + rounddown_pow_of_two(65536); + +#ifdef CONFIG_DEBUG_FS +static int fault_around_bytes_get(void *data, u64 *val) +{ + *val = fault_around_bytes; + return 0; +} + +/* + * fault_around_pages() and fault_around_mask() expects fault_around_bytes + * rounded down to nearest page order. It's what do_fault_around() expects to + * see. + */ +static int fault_around_bytes_set(void *data, u64 val) +{ + if (val / PAGE_SIZE > PTRS_PER_PTE) + return -EINVAL; + if (val > PAGE_SIZE) + fault_around_bytes = rounddown_pow_of_two(val); + else + fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, + fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); + if (!ret) + pr_warn("Failed to create fault_around_bytes in debugfs"); + return 0; +} +late_initcall(fault_around_debugfs); +#endif + +/* + * do_fault_around() tries to map few pages around the fault address. The hope + * is that the pages will be needed soon and this will lower the number of + * faults to handle. + * + * It uses vm_ops->map_pages() to map the pages, which skips the page if it's + * not ready to be mapped: not up-to-date, locked, etc. + * + * This function is called with the page table lock taken. In the split ptlock + * case the page table lock only protects only those entries which belong to + * the page table corresponding to the fault address. + * + * This function doesn't cross the VMA boundaries, in order to call map_pages() + * only once. + * + * fault_around_pages() defines how many pages we'll try to map. + * do_fault_around() expects it to return a power of two less than or equal to + * PTRS_PER_PTE. + * + * The virtual address of the area that we map is naturally aligned to the + * fault_around_pages() value (and therefore to page order). This way it's + * easier to guarantee that we don't cross page table boundaries. + */ +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ + unsigned long start_addr, nr_pages, mask; + pgoff_t max_pgoff; + struct vm_fault vmf; + int off; + + nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; + mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; + + start_addr = max(address & mask, vma->vm_start); + off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + pte -= off; + pgoff -= off; + /* - * Should we do an early C-O-W break? + * max_pgoff is either end of page table or end of vma + * or fault_around_pages() from pgoff, depending what is nearest. */ - page = vmf.page; - if (flags & FAULT_FLAG_WRITE) { - if (!(vma->vm_flags & VM_SHARED)) { - page = cow_page; - anon = 1; - copy_user_highpage(page, vmf.page, address, vma); - __SetPageUptodate(page); - } else { - /* - * If the page will be shareable, see if the backing - * address space wants to know that the page is about - * to become writable - */ - if (vma->vm_ops->page_mkwrite) { - int tmp; - - unlock_page(page); - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; - } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(page), page); - page_mkwrite = 1; - } - } - + max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + PTRS_PER_PTE - 1; + max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, + pgoff + nr_pages - 1); + + /* Check if it makes any sense to call ->map_pages */ + while (!pte_none(*pte)) { + if (++pgoff > max_pgoff) + return; + start_addr += PAGE_SIZE; + if (start_addr >= vma->vm_end) + return; + pte++; } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + vmf.virtual_address = (void __user *) start_addr; + vmf.pte = pte; + vmf.pgoff = pgoff; + vmf.max_pgoff = max_pgoff; + vmf.flags = flags; + vma->vm_ops->map_pages(vma, &vmf); +} + +static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + spinlock_t *ptl; + pte_t *pte; + int ret = 0; /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. + * Let's call ->map_pages() first and use ->fault() as fallback + * if page by the offset is not ready to be mapped (cold cache or + * something). */ - /* Only go through if we didn't race with anybody else... */ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) - pte_mksoft_dirty(entry); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); - if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); - } - } - set_pte_at(mm, address, page_table, entry); + if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && + fault_around_bytes >> PAGE_SHIFT > 1) { + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + do_fault_around(vma, address, pte, pgoff, flags); + if (!pte_same(*pte, orig_pte)) + goto unlock_out; + pte_unmap_unlock(pte, ptl); + } - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); - } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; } + do_set_pte(vma, address, fault_page, pte, false, false); + unlock_page(fault_page); +unlock_out: + pte_unmap_unlock(pte, ptl); + return ret; +} - pte_unmap_unlock(page_table, ptl); +static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page, *new_page; + struct mem_cgroup *memcg; + spinlock_t *ptl; + pte_t *pte; + int ret; - if (dirty_page) { - struct address_space *mapping = page->mapping; - int dirtied = 0; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; - if (set_page_dirty(dirty_page)) - dirtied = 1; - unlock_page(dirty_page); - put_page(dirty_page); - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + return VM_FAULT_OOM; - /* file_update_time outside page_lock */ - if (vma->vm_file && !page_mkwrite) - file_update_time(vma->vm_file); - } else { - unlock_page(vmf.page); - if (anon) - page_cache_release(vmf.page); + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { + page_cache_release(new_page); + return VM_FAULT_OOM; } - return ret; + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; -unwritable_page: - page_cache_release(page); + copy_user_highpage(new_page, fault_page, address, vma); + __SetPageUptodate(new_page); + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + goto uncharge_out; + } + do_set_pte(vma, address, new_page, pte, true, true); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); return ret; uncharge_out: - /* fs's fault handler get error */ - if (cow_page) { - mem_cgroup_uncharge_page(cow_page); - page_cache_release(cow_page); + mem_cgroup_cancel_charge(new_page, memcg); + page_cache_release(new_page); + return ret; +} + +static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + struct address_space *mapping; + spinlock_t *ptl; + pte_t *pte; + int dirtied = 0; + int ret, tmp; + + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + /* + * Check if the backing address space wants to know that the page is + * about to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(fault_page); + tmp = do_page_mkwrite(vma, fault_page, address); + if (unlikely(!tmp || + (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(fault_page); + return tmp; + } + } + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; + } + do_set_pte(vma, address, fault_page, pte, true, false); + pte_unmap_unlock(pte, ptl); + + if (set_page_dirty(fault_page)) + dirtied = 1; + mapping = fault_page->mapping; + unlock_page(fault_page); + if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); } + + /* file_update_time outside page_lock */ + if (vma->vm_file && !vma->vm_ops->page_mkwrite) + file_update_time(vma->vm_file); + return ret; } +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults). + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3496,7 +3038,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3506,7 +3054,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -3528,10 +3078,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { @@ -3546,7 +3102,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { struct page *page = NULL; @@ -3632,7 +3188,10 @@ out: * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -3641,7 +3200,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte_t entry; spinlock_t *ptl; - entry = *pte; + entry = ACCESS_ONCE(*pte); if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { @@ -3692,6 +3251,9 @@ unlock: /* * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -3751,9 +3313,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - /* THP should already have been handled */ - BUG_ON(pmd_numa(*pmd)); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could @@ -3776,6 +3335,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { @@ -3866,44 +3431,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -#if !defined(__HAVE_ARCH_GATE_AREA) - -#if defined(AT_SYSINFO_EHDR) -static struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; - - return 0; -} -__initcall(gate_vma_init); -#endif - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef AT_SYSINFO_EHDR - return &gate_vma; -#else - return NULL; -#endif -} - -int in_gate_area_no_mm(unsigned long addr) -{ -#ifdef AT_SYSINFO_EHDR - if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) - return 1; -#endif - return 0; -} - -#endif /* __HAVE_ARCH_GATE_AREA */ - static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { @@ -4054,11 +3581,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { +#ifndef CONFIG_HAVE_IOREMAP_PROT + break; +#else /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ -#ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) break; @@ -4066,9 +3595,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, ret = vma->vm_ops->access(vma, addr, buf, len, write); if (ret <= 0) -#endif break; bytes = ret; +#endif } else { bytes = len; offset = addr & (PAGE_SIZE-1); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db29606..2ff8c2325e9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -46,19 +46,84 @@ static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; +static DEFINE_MUTEX(online_page_callback_lock); -DEFINE_MUTEX(mem_hotplug_mutex); +/* The same as the cpu_hotplug lock, but for memory hotplug. */ +static struct { + struct task_struct *active_writer; + struct mutex lock; /* Synchronizes accesses to refcount, */ + /* + * Also blocks the new readers during + * an ongoing mem hotplug operation. + */ + int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} mem_hotplug = { + .active_writer = NULL, + .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), + .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "mem_hotplug.lock" }, +#endif +}; + +/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ +#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) +#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) +#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) -void lock_memory_hotplug(void) +void get_online_mems(void) { - mutex_lock(&mem_hotplug_mutex); + might_sleep(); + if (mem_hotplug.active_writer == current) + return; + memhp_lock_acquire_read(); + mutex_lock(&mem_hotplug.lock); + mem_hotplug.refcount++; + mutex_unlock(&mem_hotplug.lock); + } -void unlock_memory_hotplug(void) +void put_online_mems(void) { - mutex_unlock(&mem_hotplug_mutex); + if (mem_hotplug.active_writer == current) + return; + mutex_lock(&mem_hotplug.lock); + + if (WARN_ON(!mem_hotplug.refcount)) + mem_hotplug.refcount++; /* try to fix things up */ + + if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) + wake_up_process(mem_hotplug.active_writer); + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); + } +static void mem_hotplug_begin(void) +{ + mem_hotplug.active_writer = current; + + memhp_lock_acquire(); + for (;;) { + mutex_lock(&mem_hotplug.lock); + if (likely(!mem_hotplug.refcount)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&mem_hotplug.lock); + schedule(); + } +} + +static void mem_hotplug_done(void) +{ + mem_hotplug.active_writer = NULL; + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); +} /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) @@ -219,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_zone_end_pfn; @@ -362,8 +427,8 @@ out_fail: return -1; } -static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); @@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == generic_online_page) { online_page_callback = callback; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == callback) { online_page_callback = generic_online_page; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - lock_memory_hotplug(); + mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the @@ -907,23 +976,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); - if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && - !can_online_high_movable(zone)) { - unlock_memory_hotplug(); - return -EINVAL; - } + ret = -EINVAL; + if ((zone_idx(zone) > ZONE_NORMAL || + online_type == MMOP_ONLINE_MOVABLE) && + !can_online_high_movable(zone)) + goto out; - if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { - if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (online_type == MMOP_ONLINE_KERNEL && + zone_idx(zone) == ZONE_MOVABLE) { + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) + goto out; } - if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { - if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (online_type == MMOP_ONLINE_MOVABLE && + zone_idx(zone) == ZONE_MOVABLE - 1) { + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) + goto out; } /* Previous code may changed the zone of the pfn range */ @@ -939,8 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } /* * If this zone is not populated, then it is not in zonelist. @@ -964,8 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } zone->present_pages += onlined_pages; @@ -995,9 +1060,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); - unlock_memory_hotplug(); - - return 0; +out: + mem_hotplug_done(); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1007,7 +1072,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) struct pglist_data *pgdat; unsigned long zones_size[MAX_NR_ZONES] = {0}; unsigned long zholes_size[MAX_NR_ZONES] = {0}; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -1055,7 +1120,7 @@ int try_online_node(int nid) if (node_online(nid)) return 0; - lock_memory_hotplug(); + mem_hotplug_begin(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); @@ -1073,13 +1138,13 @@ int try_online_node(int nid) } out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } static int check_hotplug_memory_range(u64 start, u64 size) { - u64 start_pfn = start >> PAGE_SHIFT; + u64 start_pfn = PFN_DOWN(start); u64 nr_pages = size >> PAGE_SHIFT; /* Memory range must be aligned with section */ @@ -1094,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } +/* + * If movable zone has already been setup, newly added memory should be check. + * If its address is higher than movable zone, it should be added as movable. + * Without this check, movable zone may overlap with other zone. + */ +static int should_add_memory_movable(int nid, u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; + + if (zone_is_empty(movable_zone)) + return 0; + + if (movable_zone->zone_start_pfn <= start_pfn) + return 1; + + return 0; +} + +int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +{ + if (should_add_memory_movable(nid, start, size)) + return ZONE_MOVABLE; + + return zone_default; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -1117,7 +1210,7 @@ int __ref add_memory(int nid, u64 start, u64 size) new_pgdat = !p; } - lock_memory_hotplug(); + mem_hotplug_begin(); new_node = !node_online(nid); if (new_node) { @@ -1158,7 +1251,7 @@ error: release_memory_resource(res); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -1332,7 +1425,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * alloc_migrate_target should be improooooved!! * migrate_pages returns # of failed pages. */ - ret = migrate_pages(&source, alloc_migrate_target, 0, + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); @@ -1565,7 +1658,7 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_memory_hotplug(); + mem_hotplug_begin(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -1672,7 +1765,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_memory_hotplug(); + mem_hotplug_done(); return 0; failed_removal: @@ -1684,7 +1777,7 @@ failed_removal: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } @@ -1888,7 +1981,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) BUG_ON(check_hotplug_memory_range(start, size)); - lock_memory_hotplug(); + mem_hotplug_begin(); /* * All memory blocks must be offlined before removing memory. Check @@ -1897,10 +1990,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) */ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, check_memblock_offlined_cb); - if (ret) { - unlock_memory_hotplug(); + if (ret) BUG(); - } /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1909,7 +2000,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) try_offline_node(nid); - unlock_memory_hotplug(); + mem_hotplug_done(); } EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ae3c8f3595d..8f5330d74f4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -65,6 +65,8 @@ kernel is not always grateful with that. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/highmem.h> @@ -91,6 +93,7 @@ #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -526,9 +529,13 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, int nid; struct page *page; spinlock_t *ptl; + pte_t entry; ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); - page = pte_page(huge_ptep_get((pte_t *)pmd)); + entry = huge_ptep_get((pte_t *)pmd); + if (!pte_present(entry)) + goto unlock; + page = pte_page(entry); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) goto unlock; @@ -649,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, * @nodes and @flags,) it's isolated and queued to the pagelist which is * passed via @private.) */ -static struct vm_area_struct * +static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { - int err; - struct vm_area_struct *first, *vma, *prev; - + int err = 0; + struct vm_area_struct *vma, *prev; - first = find_vma(mm, start); - if (!first) - return ERR_PTR(-EFAULT); + vma = find_vma(mm, start); + if (!vma) + return -EFAULT; prev = NULL; - for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + for (; vma && vma->vm_start < end; vma = vma->vm_next) { unsigned long endvma = vma->vm_end; if (endvma > end) @@ -671,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); + return -EFAULT; if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); + return -EFAULT; } if (flags & MPOL_MF_LAZY) { @@ -687,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); - if (err) { - first = ERR_PTR(err); + if (err) break; - } } next: prev = vma; } - return first; + return err; } /* @@ -795,36 +799,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, return err; } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy. Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. - * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. - */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ - if (p->mempolicy) - p->flags |= PF_MEMPOLICY; - else - p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ - mpol_fix_fork_child_flag(current); -} - /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) @@ -861,7 +835,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); @@ -1059,7 +1032,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1180,16 +1153,17 @@ out: /* * Allocate a new page for page migration based on vma policy. - * Start assuming that page is mapped by vma pointed to by @private. + * Start by assuming the page is mapped by the same vma as contains @start. * Search forward from there, if not. N.B., this assumes that the * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { - struct vm_area_struct *vma = (struct vm_area_struct *)private; + struct vm_area_struct *vma; unsigned long uninitialized_var(address); + vma = find_vma(current->mm, start); while (vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) @@ -1219,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { return NULL; } @@ -1229,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { - struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; @@ -1295,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = queue_pages_range(mm, start, end, nmask, + err = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); /* maybe ... */ - if (!IS_ERR(vma)) + if (!err) err = mbind_range(mm, start, end, new); if (!err) { @@ -1307,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, - MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + nr_failed = migrate_pages(&pagelist, new_page, NULL, + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); } @@ -1393,7 +1363,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, unsigned long __user *, nmask, + unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; @@ -1414,7 +1384,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, } /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { int err; @@ -1556,10 +1526,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, + compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; @@ -1586,8 +1556,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, return err; } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode) +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode) { long err = 0; unsigned long __user *nm = NULL; @@ -1609,9 +1579,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, return sys_set_mempolicy(mode, nm, nr_bits+1); } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, + compat_ulong_t, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, compat_ulong_t, flags) { long err = 0; unsigned long __user *nm = NULL; @@ -1637,9 +1607,9 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, /* * get_vma_policy(@task, @vma, @addr) - * @task - task for fallback if vma policy == default - * @vma - virtual memory area whose policy is sought - * @addr - address in @vma for shared policy lookup + * @task: task for fallback if vma policy == default + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. @@ -1782,21 +1752,18 @@ static unsigned interleave_nodes(struct mempolicy *policy) /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. - * @policy must be protected by freeing by the caller. If @policy is - * the current task's mempolicy, this protection is implicit, as only the - * task can change it's policy. The system default policy requires no - * such protection. */ -unsigned slab_node(void) +unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; + int node = numa_mem_id(); if (in_interrupt()) - return numa_node_id(); + return node; policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) - return numa_node_id(); + return node; switch (policy->mode) { case MPOL_PREFERRED: @@ -1816,11 +1783,11 @@ unsigned slab_node(void) struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + return zone ? zone->node : node; } default: @@ -1888,18 +1855,18 @@ int node_random(const nodemask_t *maskp) #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) - * @vma = virtual memory area whose policy is sought - * @addr = address in @vma for shared policy lookup and interleave policy - * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted mempolicy - * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a zonelist suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -2063,7 +2030,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, retry_cpuset: pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -2071,7 +2038,7 @@ retry_cpuset: nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2081,7 +2048,7 @@ retry_cpuset: policy_nodemask(gfp, pol)); if (unlikely(mpol_needs_cond_ref(pol))) __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2115,7 +2082,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -2128,7 +2095,7 @@ retry_cpuset: policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2172,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) } else *new = *old; - rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); if (new->flags & MPOL_F_REBINDING) @@ -2180,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) else mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } - rcu_read_unlock(); atomic_set(&new->refcnt, 1); return new; } @@ -2301,41 +2266,12 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match_pid(p, last_cpupid)) - return false; - - if (p->numa_migrate_deferred) { - p->numa_migrate_deferred--; - return true; - } - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; -} -#else -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * mpol_misplaced - check whether current page node is valid in policy * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's * node id. @@ -2403,52 +2339,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_cpupid; - int this_cpupid; - polnid = thisnid; - this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); - - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { - - /* See sysctl_numa_balancing_migrate_deferred comment */ - if (!cpupid_match_pid(current, last_cpupid)) - defer_numa_migrate(current); - goto out; - } - - /* - * The quadratic filter above reduces extraneous migration - * of shared pages somewhat. This code reduces it even more, - * reducing the overhead of page migrations of shared pages. - * This makes workloads with shared pages rely more on - * "move task near its memory", and less on "move memory - * towards its task", which is exactly what we want. - */ - if (numa_migrate_deferred(current, last_cpupid)) + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; } @@ -2751,7 +2644,7 @@ void __init numa_policy_init(void) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) - printk("numa_policy_init: interleaving failed\n"); + pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } diff --git a/mm/mempool.c b/mm/mempool.c index 659aa42bad1..e209c98c720 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/blkdev.h> @@ -192,6 +193,7 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) + * Note: using __GFP_ZERO is not supported. */ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { @@ -200,6 +202,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) wait_queue_t wait; gfp_t gfp_temp; + VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_sleep_if(gfp_mask & __GFP_WAIT); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ @@ -220,6 +223,11 @@ repeat_alloc: spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); + /* + * Update the allocation stack trace as this is more useful + * for debugging. + */ + kmemleak_update_trace(element); return element; } @@ -304,9 +312,9 @@ void mempool_free(void *element, mempool_t *pool) * ensures that there will be frees which return elements to the * pool waking up the waiters. */ - if (pool->curr_nr < pool->min_nr) { + if (unlikely(pool->curr_nr < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); - if (pool->curr_nr < pool->min_nr) { + if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); wake_up(&pool->wait); diff --git a/mm/migrate.c b/mm/migrate.c index b494fdb9a63..2740360cd21 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; ptep = pte_offset_map(pmd, addr); @@ -148,8 +146,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (pte_swp_soft_dirty(*ptep)) pte = pte_mksoft_dirty(pte); + + /* Recheck VMA as permissions can change since migration started */ if (is_write_migration_entry(entry)) - pte = pte_mkwrite(pte); + pte = maybe_mkwrite(pte, vma); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -178,6 +179,37 @@ out: } /* + * Congratulations to trinity for discovering this bug. + * mm/fremap.c's remap_file_pages() accepts any range within a single vma to + * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then + * replace the specified range by file ptes throughout (maybe populated after). + * If page migration finds a page within that range, while it's still located + * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: + * zap_pte() clears the temporary migration entry before mmap_sem is dropped. + * But if the migrating page is in a part of the vma outside the range to be + * remapped, then it will not be cleared, and remove_migration_ptes() needs to + * deal with it. Fortunately, this part of the vma is of course still linear, + * so we just need to use linear location on the nonlinear list. + */ +static int remove_linear_migration_ptes_from_nonlinear(struct page *page, + struct address_space *mapping, void *arg) +{ + struct vm_area_struct *vma; + /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long addr; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr >= vma->vm_start && addr < vma->vm_end) + remove_migration_pte(page, vma, addr, arg); + } + return SWAP_AGAIN; +} + +/* * Get rid of all migration entries and replace them by * references to the indicated page. */ @@ -186,6 +218,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, + .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); @@ -750,6 +783,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (rc != MIGRATEPAGE_SUCCESS) { newpage->mapping = NULL; } else { + mem_cgroup_migrate(page, newpage, false); if (remap_swapcache) remove_migration_ptes(page, newpage); page->mapping = NULL; @@ -765,7 +799,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, { int rc = -EAGAIN; int remap_swapcache = 1; - struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { @@ -791,9 +824,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, lock_page(page); } - /* charge against new page */ - mem_cgroup_prepare_migration(page, newpage, &mem); - if (PageWriteback(page)) { /* * Only in the case of a full synchronous migration is it @@ -803,10 +833,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (mode != MIGRATE_SYNC) { rc = -EBUSY; - goto uncharge; + goto out_unlock; } if (!force) - goto uncharge; + goto out_unlock; wait_on_page_writeback(page); } /* @@ -842,7 +872,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ remap_swapcache = 0; } else { - goto uncharge; + goto out_unlock; } } @@ -855,7 +885,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the page migration right away (proteced by page lock). */ rc = balloon_page_migrate(newpage, page, mode); - goto uncharge; + goto out_unlock; } /* @@ -874,7 +904,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto uncharge; + goto out_unlock; } goto skip_unmap; } @@ -893,10 +923,7 @@ skip_unmap: if (anon_vma) put_anon_vma(anon_vma); -uncharge: - mem_cgroup_end_migration(mem, page, newpage, - (rc == MIGRATEPAGE_SUCCESS || - rc == MIGRATEPAGE_BALLOON_SUCCESS)); +out_unlock: unlock_page(page); out: return rc; @@ -906,8 +933,9 @@ out: * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, enum migrate_mode mode) +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct page *page, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -951,11 +979,18 @@ out: page_is_file_cache(page)); putback_lru_page(page); } + /* - * Move the new page to the LRU. If migration was not successful - * then this will free the page. + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, putback_lru_page() will drop the reference grabbed + * during isolation. */ - putback_lru_page(newpage); + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + ClearPageSwapBacked(newpage); + put_new_page(newpage, private); + } else + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -984,8 +1019,9 @@ out: * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_page_t get_new_page, - unsigned long private, struct page *hpage, - int force, enum migrate_mode mode) + free_page_t put_new_page, unsigned long private, + struct page *hpage, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -999,7 +1035,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. */ - if (!hugepage_migration_support(page_hstate(hpage))) { + if (!hugepage_migration_supported(page_hstate(hpage))) { putback_active_hugepage(hpage); return -ENOSYS; } @@ -1024,20 +1060,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, 1, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) remove_migration_ptes(hpage, hpage); if (anon_vma) put_anon_vma(anon_vma); - if (!rc) + if (rc == MIGRATEPAGE_SUCCESS) hugetlb_cgroup_migrate(hpage, new_hpage); unlock_page(hpage); out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - put_page(new_hpage); + + /* + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, put_page() will drop the reference grabbed during + * isolation. + */ + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(new_hpage, private); + else + put_page(new_hpage); + if (result) { if (rc) *result = rc; @@ -1054,6 +1100,8 @@ out: * @from: The list of pages to be migrated. * @get_new_page: The function used to allocate free pages to be used * as the target of the page migration. + * @put_new_page: The function used to free target pages if migration + * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for * page migration, if any. @@ -1067,7 +1115,8 @@ out: * Returns the number of pages that were not migrated, or an error code. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode, int reason) + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1089,10 +1138,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, mode); + put_new_page, private, page, + pass > 2, mode); else - rc = unmap_and_move(get_new_page, private, - page, pass > 2, mode); + rc = unmap_and_move(get_new_page, put_new_page, + private, page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1241,7 +1291,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, + err = migrate_pages(&pagelist, new_page_node, NULL, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1697,7 +1747,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + NULL, node, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); @@ -1732,7 +1783,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; @@ -1798,15 +1848,6 @@ fail_putback: goto out_unlock; } - /* - * Traditional migration needs to prepare the memcg charge - * transaction early to prevent the old page from being - * uncharged when installing migration entries. Here we can - * save the potential rollback and start the charge transfer - * only when migration is already known to end successfully. - */ - mem_cgroup_prepare_migration(page, new_page, &memcg); - orig_entry = *pmd; entry = mk_pmd(new_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); @@ -1820,7 +1861,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_new_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start); pmdp_clear_flush(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1834,17 +1875,17 @@ fail_putback: goto fail_putback; } + mem_cgroup_migrate(page, new_page, false); + page_remove_rmap(page); - /* - * Finish the charge transaction under the page table lock to - * prevent split_huge_page() from dividing up the charge - * before it's fully transferred to the new page. - */ - mem_cgroup_end_migration(memcg, page, new_page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Take an "isolate" reference and put new page on the LRU. */ + get_page(new_page); + putback_lru_page(new_page); + unlock_page(new_page); unlock_page(page); put_page(page); /* Drop the rmap reference */ diff --git a/mm/mincore.c b/mm/mincore.c index 101623378fb..725c8096104 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP - /* shmem/tmpfs may return swap: account for swapcache page too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif if (page) { present = PageUptodate(page); diff --git a/mm/mlock.c b/mm/mlock.c index 4e1a6816228..ce84cb0b83e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page) */ void mlock_vma_page(struct page *page) { + /* Serialize with page migration */ BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { @@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page) unsigned int nr_pages; struct zone *zone = page_zone(page); + /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); /* @@ -208,12 +210,19 @@ out: * @vma: target vma * @start: start address * @end: end address + * @nonblocking: * * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * - * vma->vm_mm->mmap_sem must be held for at least read. + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. */ long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) diff --git a/mm/mmap.c b/mm/mmap.c index 20ff0c33274..c0a3637cdb6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,10 +6,13 @@ * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -28,6 +31,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> +#include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> @@ -36,6 +40,7 @@ #include <linux/sched/sysctl.h> #include <linux/notifier.h> #include <linux/memory.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -130,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed, reserve; + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + vm_acct_memory(pages); /* @@ -212,7 +221,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; + mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -360,20 +369,20 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - printk("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_emerg("vm_start %lx prev %lx\n", vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - printk("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_emerg("vm_start %lx pend %lx\n", vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - printk("vm_end %lx < vm_start %lx\n", + pr_emerg("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - printk("free gap %lx, correct %lx\n", + pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -387,7 +396,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - printk("backwards %d, forwards %d\n", j, i); + pr_emerg("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ? -1 : i; @@ -405,7 +414,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) } } -void validate_mm(struct mm_struct *mm) +static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; @@ -422,17 +431,17 @@ void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - printk("map_count %d vm_next %d\n", mm->map_count, i); + pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - printk("mm->highest_vm_end %lx, found %lx\n", + pr_emerg("mm->highest_vm_end %lx, found %lx\n", mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - printk("map_count %d rb %d\n", mm->map_count, i); + pr_emerg("map_count %d rb %d\n", mm->map_count, i); bug = 1; } BUG_ON(bug); @@ -613,7 +622,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -639,11 +648,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { struct address_space *mapping = NULL; - if (vma->vm_file) + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - - if (mapping) mutex_lock(&mapping->i_mmap_mutex); + } __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); @@ -681,8 +689,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -1299,7 +1308,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* * Make sure there are no mandatory locks on the file. */ - if (locks_verify_locked(inode)) + if (locks_verify_locked(file)) return -EAGAIN; vm_flags |= VM_SHARED | VM_MAYSHARE; @@ -1568,6 +1577,17 @@ munmap_back: if (error) goto free_vma; } + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto allow_write_and_free_vma; + } + + /* ->mmap() can change vma->vm_file, but must guarantee that + * vma_link() below can deny write-access if VM_DENYWRITE is set + * and map writably if VM_SHARED is set. This usually means the + * new file must not have been exposed to user-space, yet. + */ vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) @@ -1607,8 +1627,12 @@ munmap_back: vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); + if (file) { + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + } file = vma->vm_file; out: perf_event_mmap(vma); @@ -1637,14 +1661,17 @@ out: return addr; unmap_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); charged = 0; + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +allow_write_and_free_vma: + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: @@ -1989,34 +2016,33 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = NULL; + struct rb_node *rb_node; + struct vm_area_struct *vma; /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node *rb_node; + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; - rb_node = mm->mm_rb.rb_node; - vma = NULL; + rb_node = mm->mm_rb.rb_node; + vma = NULL; - while (rb_node) { - struct vm_area_struct *vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + + if (vma) + vmacache_update(addr, vma); return vma; } @@ -2388,7 +2414,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. */ + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -2869,6 +2897,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) return 1; } +static int special_mapping_fault(struct vm_area_struct *vma, + struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. + */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ + return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static const struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + .name = special_mapping_name, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; static int special_mapping_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -2884,7 +2937,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, */ pgoff = vmf->pgoff - vma->vm_pgoff; - for (pages = vma->vm_private_data; pgoff && *pages; ++pages) + if (vma->vm_ops == &legacy_special_mapping_vmops) + pages = vma->vm_private_data; + else + pages = ((struct vm_special_mapping *)vma->vm_private_data)-> + pages; + + for (; pgoff && *pages; ++pages) pgoff--; if (*pages) { @@ -2897,37 +2956,18 @@ static int special_mapping_fault(struct vm_area_struct *vma, return VM_FAULT_SIGBUS; } -/* - * Having a close hook prevents vma merging regardless of flags. - */ -static void special_mapping_close(struct vm_area_struct *vma) -{ -} - -static const struct vm_operations_struct special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - -/* - * Called with mm->mmap_sem held for writing. - * Insert a new vma covering the given region, with the given flags. - * Its pages are supplied by the given array of struct page *. - * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. - * The region past the last page supplied will always produce SIGBUS. - * The array pointer and the pages it points to are assumed to stay alive - * for as long as this mapping might exist. - */ -int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) +static struct vm_area_struct *__install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_operations_struct *ops, + void *priv) { int ret; struct vm_area_struct *vma; vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (unlikely(vma == NULL)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; @@ -2937,8 +2977,8 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - vma->vm_ops = &special_mapping_vmops; - vma->vm_private_data = pages; + vma->vm_ops = ops; + vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) @@ -2948,11 +2988,40 @@ int install_special_mapping(struct mm_struct *mm, perf_event_mmap(vma); - return 0; + return vma; out: kmem_cache_free(vm_area_cachep, vma); - return ret; + return ERR_PTR(ret); +} + +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +struct vm_area_struct *_install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_special_mapping *spec) +{ + return __install_special_mapping(mm, addr, len, vm_flags, + &special_mapping_vmops, (void *)spec); +} + +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma = __install_special_mapping( + mm, addr, len, vm_flags, &legacy_special_mapping_vmops, + (void *)pages); + + return PTR_ERR_OR_ZERO(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); @@ -3237,7 +3306,7 @@ static struct notifier_block reserve_mem_nb = { static int __meminit init_reserve_notifier(void) { if (register_hotmemory_notifier(&reserve_mem_nb)) - printk("Failed registering memory add/remove notifier for admin reserve"); + pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 8a8cd0265e5..f802c2d216a 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -31,6 +31,9 @@ void use_mm(struct mm_struct *mm) tsk->mm = mm; switch_mm(active_mm, mm, tsk); task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif if (active_mm != mm) mmdrop(active_mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 41cefdf0aad..950813b1eb3 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -23,6 +23,25 @@ static struct srcu_struct srcu; /* + * This function allows mmu_notifier::release callback to delay a call to + * a function that will free appropriate resources. The function must be + * quick and must not block. + */ +void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)) +{ + call_srcu(&srcu, rcu, func); +} +EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); + +void mmu_notifier_synchronize(void) +{ + /* Wait for any running method to finish. */ + srcu_barrier(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + +/* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap * runs with mm_users == 0. Other tasks may still invoke mmu notifiers @@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm) hlist_del_init_rcu(&mn->hlist); } spin_unlock(&mm->mmu_notifier_mm->lock); + srcu_read_unlock(&srcu, id); /* * synchronize_srcu here prevents mmu_notifier_release from returning to @@ -325,6 +344,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); +/* + * Same as mmu_notifier_unregister but no callback and no srcu synchronization. + */ +void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. + */ + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + static int __init mmu_notifier_init(void) { return init_srcu_struct(&srcu); diff --git a/mm/mprotect.c b/mm/mprotect.c index 769a67a1580..c43d557941f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. + */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, int prot_numa, spinlock_t **ptl) +{ + pte_t *pte; + spinlock_t *pmdl; + + /* !prot_numa is protected by mmap_sem held for write */ + if (!prot_numa) + return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + + pmdl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { + spin_unlock(pmdl); + return NULL; + } + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + spin_unlock(pmdl); + return pte; +} + static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; unsigned long pages = 0; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + if (!pte) + return 0; + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -109,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; + struct mm_struct *mm = vma->vm_mm; unsigned long next; unsigned long pages = 0; unsigned long nr_huge_updates = 0; + unsigned long mni_start = 0; pmd = pmd_offset(pud, addr); do { unsigned long this_pages; next = pmd_addr_end(addr, end); + if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + continue; + + /* invoke the mmu notifier if the pmd is populated */ + if (!mni_start) { + mni_start = addr; + mmu_notifier_invalidate_range_start(mm, mni_start, end); + } + if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); @@ -130,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += HPAGE_PMD_NR; nr_huge_updates++; } + + /* huge pmd was handled */ continue; } } - /* fall through */ + /* fall through, the trans huge pmd just split */ } - if (pmd_none_or_clear_bad(pmd)) - continue; this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; } while (pmd++, addr = next, addr != end); + if (mni_start) + mmu_notifier_invalidate_range_end(mm, mni_start, end); + if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; @@ -201,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { - struct mm_struct *mm = vma->vm_mm; unsigned long pages; - mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); - mmu_notifier_invalidate_range_end(mm, start, end); return pages; } diff --git a/mm/mremap.c b/mm/mremap.c index 0843feb66f3..05f1180e9f2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -194,10 +194,17 @@ unsigned long move_page_tables(struct vm_area_struct *vma, break; if (pmd_trans_huge(*old_pmd)) { int err = 0; - if (extent == HPAGE_PMD_SIZE) + if (extent == HPAGE_PMD_SIZE) { + VM_BUG_ON(vma->vm_file || !vma->anon_vma); + /* See comment in move_ptes() */ + if (need_rmap_locks) + anon_vma_lock_write(vma->anon_vma); err = move_huge_pmd(vma, new_vma, old_addr, new_addr, old_end, old_pmd, new_pmd); + if (need_rmap_locks) + anon_vma_unlock_write(vma->anon_vma); + } if (err > 0) { need_flush = true; continue; diff --git a/mm/msync.c b/mm/msync.c index 632df4527c0..992a1673d48 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) vma = find_vma(mm, start); for (;;) { struct file *file; + loff_t fstart, fend; /* Still start < end. */ error = -ENOMEM; @@ -77,12 +78,18 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; } file = vma->vm_file; + fstart = (start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + fend = fstart + (min(end, vma->vm_end) - start) - 1; start = vma->vm_end; if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - error = vfs_fsync(file, 0); + if (vma->vm_flags & VM_NONLINEAR) + error = vfs_fsync(file, 1); + else + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index f73f2987a85..7c7ab32ee50 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end; u64 i; + memblock_clear_hotplug(0, -1); + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); @@ -197,7 +199,6 @@ unsigned long __init free_all_bootmem(void) void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - kmemleak_free_part(__va(physaddr), size); memblock_free(physaddr, size); } @@ -212,7 +213,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { - kmemleak_free_part(__va(addr), size); memblock_free(addr, size); } @@ -334,7 +334,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } -void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, +static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { diff --git a/mm/nommu.c b/mm/nommu.c index 8740213b164..a881d9673c6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -13,8 +13,11 @@ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/export.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/file.h> @@ -24,12 +27,14 @@ #include <linux/vmalloc.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/compiler.h> #include <linux/mount.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/sched/sysctl.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -296,7 +301,7 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; memcpy(addr, buf, count); - return(count); + return count; } /* @@ -459,7 +464,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. */ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } @@ -768,16 +773,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { + int i; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; + struct task_struct *curr = current; kenter("%p", vma); protect_vma(vma, 0); mm->map_count--; - if (mm->mmap_cache == vma) - mm->mmap_cache = NULL; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(mm); + break; + } + } /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -825,8 +837,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (vma && vma->vm_start <= addr && vma->vm_end > addr) + vma = vmacache_find(mm, addr); + if (likely(vma)) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -835,7 +847,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -874,8 +886,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start == addr && vma->vm_end == end) + vma = vmacache_find_exact(mm, addr, end); + if (vma) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -886,7 +898,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_start > addr) return NULL; if (vma->vm_end == end) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -995,7 +1007,7 @@ static int validate_mmap_request(struct file *file, (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file_inode(file))) + if (locks_verify_locked(file)) return -EAGAIN; if (!(capabilities & BDI_CAP_MAP_DIRECT)) @@ -1003,8 +1015,7 @@ static int validate_mmap_request(struct file *file, /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; - } - else { + } else { /* we're going to read the file into private memory we * allocate */ if (!(capabilities & BDI_CAP_MAP_COPY)) @@ -1035,23 +1046,20 @@ static int validate_mmap_request(struct file *file, if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (prot & PROT_EXEC) return -EPERM; - } - else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { if (capabilities & BDI_CAP_EXEC_MAP) prot |= PROT_EXEC; } - } - else if ((prot & PROT_READ) && + } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP) ) { /* backing file is not executable, try to copy */ capabilities &= ~BDI_CAP_MAP_DIRECT; } - } - else { + } else { /* anonymous mappings are always memory backed and can be * privately mapped */ @@ -1241,7 +1249,7 @@ error_free: return ret; enomem: - printk("Allocation of length %lu from process %d (%s) failed\n", + pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); show_free_areas(0); return -ENOMEM; @@ -1659,7 +1667,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - static int limit = 0; + static int limit; if (limit < 5) { printk(KERN_WARNING "munmap of memory not mmapped by process %d" @@ -1973,17 +1981,18 @@ error: return -ENOMEM; } -int in_gate_area_no_mm(unsigned long addr) +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { + BUG(); return 0; } +EXPORT_SYMBOL(filemap_fault); -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); - return 0; } -EXPORT_SYMBOL(filemap_fault); +EXPORT_SYMBOL(filemap_map_pages); int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, unsigned long size, pgoff_t pgoff) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3291e82d435..1e11df8fa7e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { - if (task->exit_state) - return OOM_SCAN_CONTINUE; if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; @@ -559,28 +557,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; - int ret = 1; + bool ret = true; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) if (zone_is_oom_locked(zone)) { - ret = 0; + ret = false; goto out; } - } - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { - /* - * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zonelist_oom() doesn't succeed - * when it shouldn't. - */ + /* + * Lock each zone in the zonelist under zone_scan_lock so a parallel + * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. + */ + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_set_flag(zone, ZONE_OOM_LOCKED); - } out: spin_unlock(&zone_scan_lock); @@ -592,15 +587,14 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. */ -void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) +void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) zone_clear_flag(zone, ZONE_OOM_LOCKED); - } spin_unlock(&zone_scan_lock); } @@ -694,9 +688,9 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; - zonelist = node_zonelist(first_online_node, GFP_KERNEL); - if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { + zonelist = node_zonelist(first_memory_node, GFP_KERNEL); + if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_zonelist_oom(zonelist, GFP_KERNEL); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7106cb1aca8..91d73ef1744 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0; #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) /* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -/* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect @@ -279,14 +261,11 @@ static unsigned long global_dirtyable_memory(void) */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { + const unsigned long available_memory = global_dirtyable_memory(); unsigned long background; unsigned long dirty; - unsigned long uninitialized_var(available_memory); struct task_struct *tsk; - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = global_dirtyable_memory(); - if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else @@ -593,14 +572,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ -static inline long long pos_ratio_polynom(unsigned long setpoint, +static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; @@ -842,7 +821,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, x_intercept = bdi_setpoint + span; if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), x_intercept - bdi_setpoint + 1); } else pos_ratio /= 4; @@ -1324,9 +1303,9 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); if (bdi_bg_thresh) - *bdi_bg_thresh = div_u64((u64)*bdi_thresh * - background_thresh, - dirty_thresh); + *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -1562,9 +1541,9 @@ pause: bdi_start_background_writeback(bdi); } -void set_page_dirty_balance(struct page *page, int page_mkwrite) +void set_page_dirty_balance(struct page *page) { - if (set_page_dirty(page) || page_mkwrite) { + if (set_page_dirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping) @@ -1623,7 +1602,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ - p = &__get_cpu_var(bdp_ratelimits); + p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { @@ -1635,7 +1614,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ - p = &__get_cpu_var(dirty_throttle_leaks); + p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); @@ -1682,7 +1661,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); @@ -2398,7 +2377,7 @@ int test_clear_page_writeback(struct page *page) return ret; } -int test_set_page_writeback(struct page *page) +int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); int ret; @@ -2423,9 +2402,10 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_TOWRITE); + if (!keep_write) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); @@ -2436,7 +2416,7 @@ int test_set_page_writeback(struct page *page) return ret; } -EXPORT_SYMBOL(test_set_page_writeback); +EXPORT_SYMBOL(__test_set_page_writeback); /* * Return true if any of the pages in the mapping are marked with the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bac76ae4b3..eee96195802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -261,8 +262,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) } while (zone_span_seqretry(zone, seq)); if (ret) - pr_err("page %lu outside zone [ %lu - %lu ]\n", - pfn, start_pfn, start_pfn + sp); + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); return ret; } @@ -295,7 +297,8 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, char *reason, unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason, + unsigned long bad_flags) { static unsigned long resume; static unsigned long nr_shown; @@ -407,7 +410,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) return bad; } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) { int i; @@ -451,7 +455,7 @@ static inline void set_page_guard_flag(struct page *page) { } static inline void clear_page_guard_flag(struct page *page) { } #endif -static inline void set_page_order(struct page *page, int order) +static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -502,21 +506,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, struct page *buddy, - int order) + unsigned int order) { if (!pfn_valid_within(page_to_pfn(buddy))) return 0; - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } return 0; @@ -548,6 +562,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, + unsigned long pfn, struct zone *zone, unsigned int order, int migratetype) { @@ -564,7 +579,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(migratetype == -1); - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -623,7 +638,7 @@ out: static inline int free_pages_check(struct page *page) { - char *bad_reason = NULL; + const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -665,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -699,7 +717,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, list_del(&page->lru); mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -711,13 +729,18 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); - __free_one_page(page, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); @@ -754,15 +777,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; + unsigned long pfn = page_to_pfn(page); if (!free_pages_prepare(page, order)) return; + migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); - free_one_page(page_zone(page), page, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -798,9 +822,21 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); - __free_pages(page, pageblock_order); + + if (pageblock_order >= MAX_ORDER) { + i = pageblock_nr_pages; + p = page; + do { + set_page_refcounted(p); + __free_pages(p, MAX_ORDER - 1); + p += MAX_ORDER_NR_PAGES; + } while (i -= MAX_ORDER_NR_PAGES); + } else { + set_page_refcounted(page); + __free_pages(page, pageblock_order); + } + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -859,7 +895,7 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - char *bad_reason = NULL; + const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -881,7 +917,7 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) { int i; @@ -930,6 +966,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1056,7 +1093,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1089,16 +1128,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; - int current_order; + unsigned int current_order; struct page *page; int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; current_order >= order; - --current_order) { + for (current_order = MAX_ORDER-1; + current_order >= order && current_order <= MAX_ORDER-1; + --current_order) { for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; @@ -1124,6 +1164,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) expand(zone, page, order, current_order, area, new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. + */ + set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); @@ -1172,9 +1218,9 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, int cold) + int migratetype, bool cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1191,18 +1237,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(cold == 0)) + if (likely(!cold)) list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1223,30 +1263,17 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; } local_irq_restore(flags); } -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; -} -#else -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return false; -} #endif /* @@ -1335,7 +1362,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order, t; + unsigned int order, t; struct list_head *curr; if (zone_is_empty(zone)) @@ -1367,19 +1394,20 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pages_prepare(page, 0)) return; - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -1393,17 +1421,17 @@ void free_hot_cold_page(struct page *page, int cold) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype); goto out; } migratetype = MIGRATE_MOVABLE; } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (cold) - list_add_tail(&page->lru, &pcp->lists[migratetype]); - else + if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1418,7 +1446,7 @@ out: /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; @@ -1530,12 +1558,12 @@ int split_free_page(struct page *page) */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags, - int migratetype) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) { unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); + bool cold = ((gfp_flags & __GFP_COLD) != 0); again: if (likely(order == 0)) { @@ -1580,15 +1608,13 @@ again: if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); } - /* - * NOTE: GFP_THISNODE allocations do not partake in the kswapd - * aging protocol, so they can't be fair. - */ - if (!gfp_thisnode_allocation(gfp_flags)) - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1685,12 +1711,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1705,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1720,15 +1746,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -1863,18 +1889,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); -} - -static void __paginginit init_zone_allows_reclaim(int nid) -{ - int i; - - for_each_online_node(i) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) - node_set(i, NODE_DATA(nid)->reclaim_nodes); - else - zone_reclaim_mode = 1; + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -1908,10 +1924,19 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return true; } -static inline void init_zone_allows_reclaim(int nid) +#endif /* CONFIG_NUMA */ + +static void reset_alloc_batches(struct zone *preferred_zone) { + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); } -#endif /* CONFIG_NUMA */ /* * get_page_from_freelist goes through the zonelist trying to allocate @@ -1920,18 +1945,22 @@ static inline void init_zone_allows_reclaim(int nid) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int migratetype) + struct zone *preferred_zone, int classzone_idx, int migratetype) { struct zoneref *z; struct page *page = NULL; - int classzone_idx; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; - classzone_idx = zone_idx(preferred_zone); zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1943,34 +1972,23 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) - goto try_this_zone; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. - * - * Try to stay in local zones in the fastpath. If - * that fails, the slowpath is entered, which will do - * another pass starting with the local zones, but - * ultimately fall back to remote zones that do not - * partake in the fairness round-robin cycle of this - * zonelist. - * - * NOTE: GFP_THISNODE allocations do not partake in - * the kswapd aging protocol, so they can't be fair. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - !gfp_thisnode_allocation(gfp_mask)) { - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) + break; + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; + } } /* * When allocating a page cache page for writing, we @@ -1998,15 +2016,19 @@ zonelist_scan: * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) - goto this_zone_full; + if (consider_zone_dirty && !zone_dirty_ok(zone)) + continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { int ret; + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* @@ -2068,17 +2090,11 @@ try_this_zone: if (page) break; this_zone_full: - if (IS_ENABLED(CONFIG_NUMA)) + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2087,8 +2103,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2197,12 +2242,12 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -2215,7 +2260,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto out; @@ -2240,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } @@ -2250,7 +2295,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int classzone_idx, int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2264,7 +2309,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2278,7 +2323,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; compaction_defer_reset(preferred_zone, order, true); @@ -2297,7 +2342,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2310,9 +2355,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int classzone_idx, int migratetype, + enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2351,7 +2396,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; @@ -2369,7 +2414,8 @@ retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, + migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2392,14 +2438,14 @@ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2408,7 +2454,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, +static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, struct zone *preferred_zone) @@ -2416,29 +2462,15 @@ static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); - /* - * Only reset the batches of zones that were actually - * considered in the fast path, we don't want to - * thrash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); - } + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); } static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const gfp_t wait = gfp_mask & __GFP_WAIT; + const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2447,20 +2479,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (!wait) { + if (atomic) { /* - * Not worth trying to allocate harder for - * __GFP_NOMEMALLOC even if it can't schedule. + * Not worth trying to allocate harder for __GFP_NOMEMALLOC even + * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (!(gfp_mask & __GFP_NOMEMALLOC)) alloc_flags |= ALLOC_HARDER; /* - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the + * comment for __cpuset_node_allowed_softwall(). */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && !in_interrupt()) @@ -2492,14 +2524,14 @@ static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2522,12 +2554,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (gfp_thisnode_allocation(gfp_mask)) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: - prepare_slowpath(gfp_mask, order, zonelist, - high_zoneidx, preferred_zone); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -2540,15 +2573,18 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) - first_zones_zonelist(zonelist, high_zoneidx, NULL, - &preferred_zone); + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + NULL, &preferred_zone); + classzone_idx = zonelist_zone_idx(preferred_zoneref); + } rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto got_pg; @@ -2563,7 +2599,7 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2592,17 +2628,15 @@ rebalance: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; /* * If compaction is deferred for high-order allocations, it is because @@ -2614,12 +2648,22 @@ rebalance: (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. + */ + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + classzone_idx, migratetype, + &did_some_progress); if (page) goto got_pg; @@ -2638,7 +2682,7 @@ rebalance: page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); + classzone_idx, migratetype); if (page) goto got_pg; @@ -2677,12 +2721,11 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2708,11 +2751,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; + struct zoneref *preferred_zoneref; struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; - struct mem_cgroup *memcg = NULL; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int classzone_idx; gfp_mask &= gfp_allowed_mask; @@ -2731,22 +2775,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); if (!preferred_zone) goto out; + classzone_idx = zonelist_zone_idx(preferred_zoneref); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) @@ -2755,7 +2793,7 @@ retry_cpuset: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* * Runtime PM, block IO and its error handling path @@ -2765,7 +2803,7 @@ retry_cpuset: gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); } trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -2777,11 +2815,9 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2816,7 +2852,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); else __free_pages_ok(page, order); } @@ -2835,27 +2871,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. - * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } @@ -2909,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); @@ -2917,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() @@ -2999,7 +3058,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3019,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; @@ -3045,9 +3105,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) goto out; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } @@ -3200,12 +3260,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -3349,7 +3409,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -4093,7 +4153,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { - int order, t; + unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; @@ -4105,7 +4165,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4221,8 +4281,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -4347,9 +4407,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative */ int __meminit __early_pfn_to_nid(unsigned long pfn) { @@ -4404,10 +4461,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling memblock_free_early_nid() - * manually. + * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4429,9 +4485,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. + * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { @@ -4449,7 +4504,7 @@ void __init sparse_memory_present_with_active_regions(int nid) * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ @@ -4919,7 +4974,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif @@ -5027,7 +5081,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { @@ -5070,7 +5124,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); - struct memblock_type *type = &memblock.memory; + struct memblock_region *r; /* Need to find movable_zone earlier when movable_node is specified. */ find_usable_zone_for_movable(); @@ -5080,13 +5134,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) * options. */ if (movable_node_is_enabled()) { - for (i = 0; i < type->cnt; i++) { - if (!memblock_is_hotpluggable(&type->regions[i])) + for_each_memblock(memory, r) { + if (!memblock_is_hotpluggable(r)) continue; - nid = type->regions[i].nid; + nid = r->nid; - usable_startpfn = PFN_DOWN(type->regions[i].base); + usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; @@ -5248,7 +5302,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed @@ -5532,7 +5586,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -5647,9 +5701,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); @@ -5771,7 +5824,7 @@ module_init(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5788,7 +5841,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, } #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5804,7 +5857,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return 0; } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5830,7 +5883,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5843,27 +5896,42 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; @@ -6000,59 +6068,73 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) } /** - * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest to retrieve - * @end_bitidx: The last bit of interest - * returns pageblock_bits flags + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long flags = 0; - unsigned long value = 1; + unsigned long bitidx, word_bitidx; + unsigned long word; zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (test_bit(bitidx + start_bitidx, bitmap)) - flags |= value; - - return flags; + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; } /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest - * @end_bitidx: The last bit of interest * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx) +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long value = 1; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (flags & value) - __set_bit(bitidx + start_bitidx, bitmap); - else - __clear_bit(bitidx + start_bitidx, bitmap); + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = ACCESS_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } } /* @@ -6212,7 +6294,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6251,7 +6333,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); @@ -6406,7 +6488,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - int order, i; + unsigned int order, i; unsigned long pfn; unsigned long flags; /* find the first valid pfn */ @@ -6458,7 +6540,7 @@ bool is_free_buddy_page(struct page *page) struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; - int order; + unsigned int order; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -6544,7 +6626,8 @@ static void dump_page_flags(unsigned long flags) printk(")\n"); } -void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", @@ -6560,8 +6643,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) mem_cgroup_print_bad_page(page); } -void dump_page(struct page *page, char *reason) +void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); } -EXPORT_SYMBOL_GPL(dump_page); +EXPORT_SYMBOL(dump_page); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index cfd162882c0..3708264d283 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -175,7 +175,7 @@ static void free_page_cgroup(void *addr) } } -void __free_page_cgroup(unsigned long pfn) +static void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; struct page_cgroup *base; @@ -188,9 +188,9 @@ void __free_page_cgroup(unsigned long pfn) ms->page_cgroup = NULL; } -int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) +static int __meminit online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) { unsigned long start, end, pfn; int fail = 0; @@ -223,8 +223,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) +static int __meminit offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) { unsigned long start, end, pfn; diff --git a/mm/page_io.c b/mm/page_io.c index 7c59ef68138..955db8b0d49 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -248,21 +248,34 @@ out: return ret; } +static sector_t swap_page_sector(struct page *page) +{ + return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)) { struct bio *bio; - int ret = 0, rw = WRITE; + int ret, rw = WRITE; struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - struct iovec iov = { - .iov_base = kmap(page), - .iov_len = PAGE_SIZE, + struct bio_vec bv = { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0 + }; + struct iov_iter from = { + .type = ITER_BVEC | WRITE, + .count = PAGE_SIZE, + .iov_offset = 0, + .nr_segs = 1, }; + from.bvec = &bv; /* older gcc versions are broken */ init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); @@ -270,10 +283,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); - ret = mapping->a_ops->direct_IO(KERNEL_WRITE, - &kiocb, &iov, - kiocb.ki_pos, 1); - kunmap(page); + ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE, + &kiocb, &from, + kiocb.ki_pos); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; @@ -297,6 +309,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return ret; } + ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); + if (!ret) { + count_vm_event(PSWPOUT); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); @@ -338,6 +357,13 @@ int swap_readpage(struct page *page) return ret; } + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { + count_vm_event(PSWPIN); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 3707c71ae4c..51108165f82 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -108,7 +108,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - unsigned int cpu; + unsigned int cpu, tcpu; int i; for_each_possible_cpu(cpu) { @@ -116,14 +116,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); - if (!*pagep) { - pcpu_free_pages(chunk, pages, populated, - page_start, page_end); - return -ENOMEM; - } + if (!*pagep) + goto err; } } return 0; + +err: + while (--i >= page_start) + __free_page(pages[pcpu_page_idx(cpu, i)]); + + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + for (i = page_start; i < page_end; i++) + __free_page(pages[pcpu_page_idx(tcpu, i)]); + } + return -ENOMEM; } /** @@ -263,6 +272,7 @@ err: __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); } + pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } diff --git a/mm/percpu.c b/mm/percpu.c index 036cfe07050..da997f9800b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -102,10 +102,11 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ void *data; /* chunk data */ + int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -356,11 +357,11 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { int new_alloc; - if (chunk->map_alloc >= chunk->map_used + 2) + if (chunk->map_alloc >= chunk->map_used + 3) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 2) + while (new_alloc < chunk->map_used + 3) new_alloc *= 2; return new_alloc; @@ -418,48 +419,6 @@ out_unlock: } /** - * pcpu_split_block - split a map block - * @chunk: chunk of interest - * @i: index of map block to split - * @head: head size in bytes (can be 0) - * @tail: tail size in bytes (can be 0) - * - * Split the @i'th map block into two or three blocks. If @head is - * non-zero, @head bytes block is inserted before block @i moving it - * to @i+1 and reducing its size by @head bytes. - * - * If @tail is non-zero, the target block, which can be @i or @i+1 - * depending on @head, is reduced by @tail bytes and @tail byte block - * is inserted after the target block. - * - * @chunk->map must have enough free slots to accommodate the split. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_split_block(struct pcpu_chunk *chunk, int i, - int head, int tail) -{ - int nr_extra = !!head + !!tail; - - BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - - /* insert new subblocks */ - memmove(&chunk->map[i + nr_extra], &chunk->map[i], - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - chunk->map[i + 1] = chunk->map[i] - head; - chunk->map[i++] = head; - } - if (tail) { - chunk->map[i++] -= tail; - chunk->map[i] = tail; - } -} - -/** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes @@ -483,19 +442,27 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; int i, off; + bool seen_free = false; + int *p; - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { - bool is_last = i + 1 == chunk->map_used; + for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { int head, tail; + int this_size; + + off = *p; + if (off & 1) + continue; /* extra for alignment requirement */ head = ALIGN(off, align) - off; - BUG_ON(i == 0 && head != 0); - if (chunk->map[i] < 0) - continue; - if (chunk->map[i] < head + size) { - max_contig = max(chunk->map[i], max_contig); + this_size = (p[1] & ~1) - off; + if (this_size < head + size) { + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + max_contig = max(this_size, max_contig); continue; } @@ -505,44 +472,59 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * than sizeof(int), which is very small but isn't too * uncommon for percpu allocations. */ - if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { - if (chunk->map[i - 1] > 0) - chunk->map[i - 1] += head; - else { - chunk->map[i - 1] -= head; + if (head && (head < sizeof(int) || !(p[-1] & 1))) { + *p = off += head; + if (p[-1] & 1) chunk->free_size -= head; - } - chunk->map[i] -= head; - off += head; + else + max_contig = max(*p - p[-1], max_contig); + this_size -= head; head = 0; } /* if tail is small, just keep it around */ - tail = chunk->map[i] - head - size; - if (tail < sizeof(int)) + tail = this_size - head - size; + if (tail < sizeof(int)) { tail = 0; + size = this_size - head; + } /* split if warranted */ if (head || tail) { - pcpu_split_block(chunk, i, head, tail); + int nr_extra = !!head + !!tail; + + /* insert new subblocks */ + memmove(p + nr_extra + 1, p + 1, + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + if (head) { - i++; - off += head; - max_contig = max(chunk->map[i - 1], max_contig); + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + *++p = off += head; + ++i; + max_contig = max(head, max_contig); + } + if (tail) { + p[1] = off + size; + max_contig = max(tail, max_contig); } - if (tail) - max_contig = max(chunk->map[i + 1], max_contig); } + if (!seen_free) + chunk->first_free = i + 1; + /* update hint and mark allocated */ - if (is_last) + if (i + 1 == chunk->map_used) chunk->contig_hint = max_contig; /* fully scanned */ else chunk->contig_hint = max(chunk->contig_hint, max_contig); - chunk->free_size -= chunk->map[i]; - chunk->map[i] = -chunk->map[i]; + chunk->free_size -= size; + *p |= 1; pcpu_chunk_relocate(chunk, oslot); return off; @@ -570,34 +552,50 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { int oslot = pcpu_chunk_slot(chunk); - int i, off; - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) - if (off == freeme) - break; + int off = 0; + unsigned i, j; + int to_free = 0; + int *p; + + freeme |= 1; /* we are searching for <given offset, in use> pair */ + + i = 0; + j = chunk->map_used; + while (i != j) { + unsigned k = (i + j) / 2; + off = chunk->map[k]; + if (off < freeme) + i = k + 1; + else if (off > freeme) + j = k; + else + i = j = k; + } BUG_ON(off != freeme); - BUG_ON(chunk->map[i] > 0); - chunk->map[i] = -chunk->map[i]; - chunk->free_size += chunk->map[i]; + if (i < chunk->first_free) + chunk->first_free = i; + + p = chunk->map + i; + *p = off &= ~1; + chunk->free_size += (p[1] & ~1) - off; + /* merge with next? */ + if (!(p[1] & 1)) + to_free++; /* merge with previous? */ - if (i > 0 && chunk->map[i - 1] >= 0) { - chunk->map[i - 1] += chunk->map[i]; - chunk->map_used--; - memmove(&chunk->map[i], &chunk->map[i + 1], - (chunk->map_used - i) * sizeof(chunk->map[0])); + if (i > 0 && !(p[-1] & 1)) { + to_free++; i--; + p--; } - /* merge with next? */ - if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { - chunk->map[i] += chunk->map[i + 1]; - chunk->map_used--; - memmove(&chunk->map[i + 1], &chunk->map[i + 2], - (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + if (to_free) { + chunk->map_used -= to_free; + memmove(p + 1, p + 1 + to_free, + (chunk->map_used - i) * sizeof(chunk->map[0])); } - chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } @@ -612,12 +610,14 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - kfree(chunk); + pcpu_mem_free(chunk, pcpu_chunk_struct_size); return NULL; } chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->map[0] = 0; + chunk->map[1] = pcpu_unit_size | 1; + chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; @@ -713,6 +713,15 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) unsigned long flags; void __percpu *ptr; + /* + * We want the lowest bit of offset available for in-use/free + * indicator, so force >= 16bit alignment and make size even. + */ + if (unlikely(align < 2)) + align = 2; + + size = ALIGN(size, 2); + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -1343,9 +1352,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -ai->static_size; + schunk->map[0] = 1; + schunk->map[1] = ai->static_size; + schunk->map_used = 1; if (schunk->free_size) - schunk->map[schunk->map_used++] = schunk->free_size; + schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); + else + schunk->map[1] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1358,8 +1371,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; - dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; - dchunk->map[dchunk->map_used++] = dchunk->free_size; + dchunk->map[0] = 1; + dchunk->map[1] = pcpu_reserved_chunk_limit; + dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; + dchunk->map_used = 2; } /* link the first chunk in */ @@ -1917,6 +1932,8 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); + + pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index a8b91992593..dfb79e028ec 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t entry = *pmdp; if (pmd_numa(entry)) entry = pmd_mknonnuma(entry); - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd26d043350..5077afcd9e1 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -23,129 +23,40 @@ /** * process_vm_rw_pages - read/write pages from task specified - * @task: task to read/write from - * @mm: mm for task - * @process_pages: struct pages area that can store at least - * nr_pages_to_copy struct page pointers - * @pa: address of page in task to start copying from/to + * @pages: array of pointers to pages we want to copy * @start_offset: offset in page to start copying from/to * @len: number of bytes to copy - * @lvec: iovec array specifying where to copy to/from - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to - * @nr_pages_to_copy: number of pages to copy - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success, error code otherwise */ -static int process_vm_rw_pages(struct task_struct *task, - struct mm_struct *mm, - struct page **process_pages, - unsigned long pa, - unsigned long start_offset, - unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, - int vm_write, - unsigned int nr_pages_to_copy, - ssize_t *bytes_copied) +static int process_vm_rw_pages(struct page **pages, + unsigned offset, + size_t len, + struct iov_iter *iter, + int vm_write) { - int pages_pinned; - void *target_kaddr; - int pgs_copied = 0; - int j; - int ret; - ssize_t bytes_to_copy; - ssize_t rc = 0; - - *bytes_copied = 0; - - /* Get the pages we're interested in */ - down_read(&mm->mmap_sem); - pages_pinned = get_user_pages(task, mm, pa, - nr_pages_to_copy, - vm_write, 0, process_pages, NULL); - up_read(&mm->mmap_sem); - - if (pages_pinned != nr_pages_to_copy) { - rc = -EFAULT; - goto end; - } - /* Do the copy for each page */ - for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); - pgs_copied++) { - /* Make sure we have a non zero length iovec */ - while (*lvec_current < lvec_cnt - && lvec[*lvec_current].iov_len == 0) - (*lvec_current)++; - if (*lvec_current == lvec_cnt) - break; + while (len && iov_iter_count(iter)) { + struct page *page = *pages++; + size_t copy = PAGE_SIZE - offset; + size_t copied; - /* - * Will copy smallest of: - * - bytes remaining in page - * - bytes remaining in destination iovec - */ - bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, - len - *bytes_copied); - bytes_to_copy = min_t(ssize_t, bytes_to_copy, - lvec[*lvec_current].iov_len - - *lvec_offset); - - target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; - - if (vm_write) - ret = copy_from_user(target_kaddr, - lvec[*lvec_current].iov_base - + *lvec_offset, - bytes_to_copy); - else - ret = copy_to_user(lvec[*lvec_current].iov_base - + *lvec_offset, - target_kaddr, bytes_to_copy); - kunmap(process_pages[pgs_copied]); - if (ret) { - *bytes_copied += bytes_to_copy - ret; - pgs_copied++; - rc = -EFAULT; - goto end; - } - *bytes_copied += bytes_to_copy; - *lvec_offset += bytes_to_copy; - if (*lvec_offset == lvec[*lvec_current].iov_len) { - /* - * Need to copy remaining part of page into the - * next iovec if there are any bytes left in page - */ - (*lvec_current)++; - *lvec_offset = 0; - start_offset = (start_offset + bytes_to_copy) - % PAGE_SIZE; - if (start_offset) - pgs_copied--; - } else { - start_offset = 0; - } - } + if (copy > len) + copy = len; -end: - if (vm_write) { - for (j = 0; j < pages_pinned; j++) { - if (j < pgs_copied) - set_page_dirty_lock(process_pages[j]); - put_page(process_pages[j]); + if (vm_write) { + copied = copy_page_from_iter(page, offset, copy, iter); + set_page_dirty_lock(page); + } else { + copied = copy_page_to_iter(page, offset, copy, iter); } - } else { - for (j = 0; j < pages_pinned; j++) - put_page(process_pages[j]); + len -= copied; + if (copied < copy && iov_iter_count(iter)) + return -EFAULT; + offset = 0; } - - return rc; + return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ @@ -155,67 +66,60 @@ end: * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from - * @lvec: iovec array specifying where to copy to/from locally - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, + struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, - int vm_write, - ssize_t *bytes_copied) + int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; - ssize_t bytes_copied_loop; ssize_t rc = 0; - unsigned long nr_pages_copied = 0; - unsigned long nr_pages_to_copy; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - *bytes_copied = 0; - /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { - nr_pages_to_copy = min(nr_pages - nr_pages_copied, - max_pages_per_loop); + while (!rc && nr_pages && iov_iter_count(iter)) { + int pages = min(nr_pages, max_pages_per_loop); + size_t bytes; - rc = process_vm_rw_pages(task, mm, process_pages, pa, - start_offset, len, - lvec, lvec_cnt, - lvec_current, lvec_offset, - vm_write, nr_pages_to_copy, - &bytes_copied_loop); - start_offset = 0; - *bytes_copied += bytes_copied_loop; + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages = get_user_pages(task, mm, pa, pages, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); - if (rc < 0) { - return rc; - } else { - len -= bytes_copied_loop; - nr_pages_copied += nr_pages_to_copy; - pa += nr_pages_to_copy * PAGE_SIZE; - } + if (pages <= 0) + return -EFAULT; + + bytes = pages * PAGE_SIZE - start_offset; + if (bytes > len) + bytes = len; + + rc = process_vm_rw_pages(process_pages, + start_offset, bytes, iter, + vm_write); + len -= bytes; + start_offset = 0; + nr_pages -= pages; + pa += pages * PAGE_SIZE; + while (pages) + put_page(process_pages[--pages]); } return rc; @@ -228,8 +132,7 @@ static int process_vm_rw_single_vec(unsigned long addr, /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to - * @lvec: iovec array specifying where to copy to/from locally - * @liovcnt: size of lvec array + * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused @@ -238,8 +141,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * return less bytes than expected if an error occurs during the copying * process. */ -static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, - unsigned long liovcnt, +static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) @@ -250,13 +152,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, struct mm_struct *mm; unsigned long i; ssize_t rc = 0; - ssize_t bytes_copied_loop; - ssize_t bytes_copied = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; - unsigned long iov_l_curr_idx = 0; - size_t iov_l_curr_offset = 0; ssize_t iov_len; + size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need @@ -310,24 +209,20 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto put_task_struct; } - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, - process_pages, mm, task, vm_write, &bytes_copied_loop); - bytes_copied += bytes_copied_loop; - if (rc != 0) { - /* If we have managed to copy any data at all then - we return the number of bytes copied. Otherwise - we return the error code */ - if (bytes_copied) - rc = bytes_copied; - goto put_mm; - } - } + iter, process_pages, mm, task, vm_write); + + /* copied = space before - space after */ + total_len -= iov_iter_count(iter); + + /* If we have managed to copy any data at all then + we return the number of bytes copied. Otherwise + we return the error code */ + if (total_len) + rc = total_len; - rc = bytes_copied; -put_mm: mmput(mm); put_task_struct: @@ -363,6 +258,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc; if (flags != 0) @@ -378,13 +274,14 @@ static ssize_t process_vm_rw(pid_t pid, if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); + rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) @@ -412,7 +309,7 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, #ifdef CONFIG_COMPAT -asmlinkage ssize_t +static ssize_t compat_process_vm_rw(compat_pid_t pid, const struct compat_iovec __user *lvec, unsigned long liovcnt, @@ -424,6 +321,7 @@ compat_process_vm_rw(compat_pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc = -EFAULT; if (flags != 0) @@ -439,14 +337,14 @@ compat_process_vm_rw(compat_pid_t pid, &iov_l); if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) @@ -456,25 +354,23 @@ free_iovecs: return rc; } -asmlinkage ssize_t -compat_sys_process_vm_readv(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } -asmlinkage ssize_t -compat_sys_process_vm_writev(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); diff --git a/mm/readahead.c b/mm/readahead.c index 0de2360d65f..17b9172ec37 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,9 +8,7 @@ */ #include <linux/kernel.h> -#include <linux/fs.h> #include <linux/gfp.h> -#include <linux/mm.h> #include <linux/export.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -20,6 +18,8 @@ #include <linux/syscalls.h> #include <linux/file.h> +#include "internal.h" + /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -149,8 +149,7 @@ out: * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -static int -__do_page_cache_readahead(struct address_space *mapping, struct file *filp, +int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); rcu_read_unlock(); - if (page) + if (page && !radix_tree_exceptional_entry(page)) continue; page = page_cache_alloc_readahead(mapping); @@ -233,28 +232,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); -} - -/* - * Submit IO for the read-ahead request in file_ra_state. - */ -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) -{ - int actual; - - actual = __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); - - return actual; + return min(nr, MAX_READAHEAD); } /* @@ -341,13 +326,12 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset, unsigned long max) { pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -364,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping, { pgoff_t size; - size = count_history_pages(mapping, ra, offset, max); + size = count_history_pages(mapping, offset, max); /* * not enough history pages: @@ -427,7 +411,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset + 1, max); rcu_read_unlock(); if (!start || start - offset > max) diff --git a/mm/rmap.c b/mm/rmap.c index d9d42316a99..3e8491c504f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ + might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); @@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) * above cannot corrupt). */ if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; + return NULL; } out: rcu_read_unlock(); @@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; - goto out; + return NULL; } /* we pinned the anon_vma, its safe to sleep */ @@ -515,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) static inline unsigned long __vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (unlikely(is_vm_hugetlb_page(vma))) - pgoff = page->index << huge_page_order(page_hstate(page)); - + pgoff_t pgoff = page_to_pgoff(page); return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); } @@ -567,6 +565,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pgd_t *pgd; pud_t *pud; pmd_t *pmd = NULL; + pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -577,7 +576,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + /* + * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = ACCESS_ONCE(*pmd); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) pmd = NULL; out: return pmd; @@ -613,9 +618,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (!pmd) return NULL; - if (pmd_trans_huge(*pmd)) - return NULL; - pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ if (!sync && !pte_present(*pte)) { @@ -669,7 +671,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -986,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page, { int first = atomic_inc_and_test(&page->_mapcount); if (first) { + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption + * disabled. + */ if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); @@ -1024,11 +1032,6 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) { - SetPageActive(page); - lru_cache_add(page); - } else - add_page_to_unevictable_list(page); } /** @@ -1077,11 +1080,15 @@ void page_remove_rmap(struct page *page) /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. + * + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. */ if (unlikely(PageHuge(page))) goto out; if (anon) { - mem_cgroup_uncharge_page(page); if (PageTransHuge(page)) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); @@ -1112,7 +1119,7 @@ out: /* * @arg: enum ttu_flags will be passed to this argument */ -int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1135,7 +1142,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) goto out_mlock; - if (TTU_ACTION(flags) == TTU_MUNLOCK) + if (flags & TTU_MUNLOCK) goto out_unmap; } if (!(flags & TTU_IGNORE_ACCESS)) { @@ -1165,6 +1172,16 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + */ + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; @@ -1193,7 +1210,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); + BUG_ON(!(flags & TTU_MIGRATION)); entry = make_migration_entry(page, pte_write(pteval)); } swp_pte = swp_entry_to_pte(entry); @@ -1202,7 +1219,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (TTU_ACTION(flags) == TTU_MIGRATION)) { + (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1215,7 +1232,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL) + if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1322,9 +1339,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, BUG_ON(!page || PageAnon(page)); if (locked_vma) { - mlock_vma_page(page); /* no-op if already mlocked */ - if (page == check_page) + if (page == check_page) { + /* we know we have check_page locked */ + mlock_vma_page(page); ret = SWAP_MLOCK; + } else if (trylock_page(page)) { + /* + * If we can lock the page, perform mlock. + * Otherwise leave the page alone, it will be + * eventually encountered again later. + */ + mlock_vma_page(page); + unlock_page(page); + } continue; /* don't unmap */ } @@ -1339,7 +1366,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (page->index != linear_page_index(vma, address)) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(pteval)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, address, pte, ptfile); } @@ -1360,8 +1387,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, } static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, struct vm_area_struct *vma) + struct address_space *mapping, void *arg) { + struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1491,7 +1519,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; ret = rmap_walk(page, &rwc); @@ -1543,10 +1571,9 @@ void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; + anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); - - anon_vma_free(anon_vma); } static struct anon_vma *rmap_walk_anon_lock(struct page *page, @@ -1588,7 +1615,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1629,7 +1656,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << compound_order(page); + pgoff_t pgoff = page_to_pgoff(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1663,7 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) if (list_empty(&mapping->i_mmap_nonlinear)) goto done; - ret = rwc->file_nonlinear(page, mapping, vma); + ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/shmem.c b/mm/shmem.c index 1f18c9d0d93..469f90d5605 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,6 +66,9 @@ static struct vfsmount *shm_mnt; #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> +#include <linux/syscalls.h> +#include <linux/fcntl.h> +#include <uapi/linux/memfd.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -80,11 +83,12 @@ static struct vfsmount *shm_mnt; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate and shmem_writepage communicate via inode->i_private - * (with i_mutex making sure that it has only one user at a time): - * we would prefer not to enlarge the shmem inode just for that. + * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * inode->i_private (with i_mutex making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { + wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ @@ -148,6 +152,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) vm_unacct_memory(VM_ACCT(size)); } +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. @@ -242,19 +259,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { void **pslot; - void *item = NULL; + void *item; VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + if (!pslot) + return -ENOENT; + item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + radix_tree_replace_slot(pslot, replacement); return 0; } @@ -281,7 +296,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + pgoff_t index, void *expected) { int error; @@ -331,84 +346,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Like find_get_pages, but collecting swap entries as well as pages. - */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) -{ - void **slot; - unsigned int ret = 0; - struct radix_tree_iter iter; - - if (!nr_pages) - return 0; - - rcu_read_lock(); -restart: - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *slot)) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = iter.index; - pages[ret] = page; - if (++ret == nr_pages) - break; - } - rcu_read_unlock(); - return ret; -} - -/* * Remove swap entry from radix tree, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { - int error; + void *old; spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + old = radix_tree_delete_item(&mapping->page_tree, index, radswap); spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; -} - -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. - */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; - - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; } /* @@ -429,12 +380,12 @@ void shmem_unlock_mapping(struct address_space *mapping) * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it * has finished, if it hits a row of PAGEVEC_SIZE swap entries. */ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) break; index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(pvec.pages, pvec.nr); pagevec_release(&pvec); cond_resched(); @@ -466,12 +417,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec, 0); index = start; while (index < end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) break; - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -497,9 +447,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -533,23 +482,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, return; index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + + pvec.nr = find_get_entries(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.pages, indices); if (!pvec.nr) { - if (index == start || unfalloc) + /* If all gone or hole-punch or unfalloc, we're done */ + if (index == start || end != -1) break; + /* But if truncating, restart to make sure all gone */ index = start; continue; } - if ((index == start || unfalloc) && indices[0] >= end) { - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - break; - } - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -560,8 +506,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (radix_tree_exceptional_entry(page)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + if (shmem_free_swap(mapping, index, page)) { + /* Swap was replaced by page: retry */ + index--; + break; + } + nr_swaps_freed++; continue; } @@ -570,13 +520,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (page->mapping == mapping) { VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); + } else { + /* Page was replaced by swap: retry */ + unlock_page(page); + index--; + break; } } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); index++; } @@ -596,6 +550,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); int error; error = inode_change_ok(inode, attr); @@ -606,7 +561,16 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } @@ -662,7 +626,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); if (index == -1) - return 0; + return -EAGAIN; /* tell shmem_unuse we found nothing */ /* * Move _head_ to start search for next from here. @@ -707,7 +671,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); + radswap); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -721,7 +685,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, spin_unlock(&info->lock); swap_free(swap); } - error = 1; /* not an error, but entry was found */ } return error; } @@ -733,7 +696,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) { struct list_head *this, *next; struct shmem_inode_info *info; - int found = 0; + struct mem_cgroup *memcg; int error = 0; /* @@ -748,26 +711,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ + error = -EAGAIN; mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, &page); + error = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); - if (found) + if (error != -EAGAIN) break; + /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex); - if (found < 0) - error = found; + if (error) { + if (error != -ENOMEM) + error = 0; + mem_cgroup_cancel_charge(page, memcg); + } else + mem_cgroup_commit_charge(page, memcg, true); out: unlock_page(page); page_cache_release(page); @@ -824,6 +793,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && + !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped++; @@ -870,7 +840,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); - swapcache_free(swap, NULL); + swapcache_free(swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@ -1043,7 +1013,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_replace_page_cache(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage, false); lru_cache_add_anon(newpage); *pagep = newpage; } @@ -1070,6 +1040,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1080,7 +1051,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return -EFBIG; repeat: swap.val = 0; - page = find_lock_page(mapping, index); + page = find_lock_entry(mapping, index); if (radix_tree_exceptional_entry(page)) { swap = radix_to_swp_entry(page); page = NULL; @@ -1092,6 +1063,9 @@ repeat: goto failed; } + if (page && sgp == SGP_WRITE) + mark_page_accessed(page); + /* fallocated page? */ if (page && !PageUptodate(page)) { if (sgp != SGP_READ) @@ -1145,11 +1119,10 @@ repeat: goto failed; } - error = mem_cgroup_cache_charge(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + swp_to_radix_entry(swap)); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1162,17 +1135,24 @@ repeat: * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. */ - if (error) + if (error) { + mem_cgroup_cancel_charge(page, memcg); delete_from_swap_cache(page); + } } if (error) goto failed; + mem_cgroup_commit_charge(page, memcg, true); + spin_lock(&info->lock); info->swapped--; shmem_recalc_inode(inode); spin_unlock(&info->lock); + if (sgp == SGP_WRITE) + mark_page_accessed(page); + delete_from_swap_cache(page); set_page_dirty(page); swap_free(swap); @@ -1197,22 +1177,25 @@ repeat: goto decused; } - SetPageSwapBacked(page); + __SetPageSwapBacked(page); __set_page_locked(page); - error = mem_cgroup_cache_charge(page, current->mm, - gfp & GFP_RECLAIM_MASK); + if (sgp == SGP_WRITE) + __SetPageReferenced(page); + + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + NULL); radix_tree_preload_end(); } if (error) { - mem_cgroup_uncharge_cache_page(page); + mem_cgroup_cancel_charge(page, memcg); goto decused; } + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_anon(page); spin_lock(&info->lock); @@ -1298,6 +1281,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_mutex. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_mutex in fault, + * and bloating every shmem inode for this unlikely case would be sad. + */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT(shmem_fault_wait); + + ret = VM_FAULT_NOPAGE; + if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && + !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* It's polite to up mmap_sem if we can */ + up_read(&vma->vm_mm->mmap_sem); + ret = VM_FAULT_RETRY; + } + + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + spin_unlock(&inode->i_lock); + return ret; + } + spin_unlock(&inode->i_lock); + } + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1380,6 +1421,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -1417,6 +1459,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->backing_dev_info == &shmem_backing_dev_info; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -1433,7 +1480,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + /* i_mutex is held by caller */ + if (unlikely(info->seals)) { + if (info->seals & F_SEAL_WRITE) + return -EPERM; + if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1462,13 +1519,17 @@ shmem_write_end(struct file *file, struct address_space *mapping, return copied; } -static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct inode *inode = file_inode(filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; enum sgp_type sgp = SGP_READ; + int error = 0; + ssize_t retval = 0; + loff_t *ppos = &iocb->ki_pos; /* * Might this read be for a stacking filesystem? Then when reading @@ -1496,10 +1557,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ break; } - desc->error = shmem_getpage(inode, index, &page, sgp, NULL); - if (desc->error) { - if (desc->error == -EINVAL) - desc->error = 0; + error = shmem_getpage(inode, index, &page, sgp, NULL); + if (error) { + if (error == -EINVAL) + error = 0; break; } if (page) @@ -1543,61 +1604,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); + ret = copy_page_to_iter(page, offset, nr, to); + retval += ret; offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); - if (ret != nr || !desc->count) + if (!iov_iter_count(to)) break; - + if (ret < nr) { + error = -EFAULT; + break; + } cond_resched(); } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - file_accessed(filp); -} - -static ssize_t shmem_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; - - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } - return retval; + file_accessed(file); + return retval ? retval : error; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, @@ -1636,7 +1662,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + nr_pages = min(req_pages, spd.nr_pages_max); spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); @@ -1729,7 +1755,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pagevec_init(&pvec, 0); pvec.nr = 1; /* start small: we may be there already */ while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr = find_get_entries(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { if (whence == SEEK_DATA) @@ -1756,7 +1782,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, break; } } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); pvec.nr = PAGEVEC_SIZE; cond_resched(); @@ -1802,27 +1828,271 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on shmem. + */ +#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void shmem_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + } else if (page_count(page) - page_mapcount(page) > 1) { + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, iter.index, + SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); + } + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. + */ +static int shmem_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + int error, scan; + + shmem_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + start, SHMEM_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + error = -EBUSY; + } + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(&mapping->page_tree, + iter.index, SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); +continue_resched: + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); + } + + return error; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +int shmem_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a shmem-file but restrict + * access to a specific subset of file operations. Seals can only be + * added, but never removed. This way, mutually untrusted parties can + * share common memory regions with a well-defined policy. A malicious + * peer can thus never perform unwanted operations on a shared object. + * + * Seals are only supported on special shmem-files and always affect + * the whole underlying inode. Once a seal is set, it may prevent some + * kinds of access to the file. Currently, the following seals are + * defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous shmem files support sealing. More importantly, seals are + * never written to disk. Therefore, there's no plan to support it on + * other file types. + */ + + if (file->f_op != &shmem_file_operations) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + mutex_lock(&inode->i_mutex); + + if (info->seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = shmem_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + info->seals |= seals; + error = 0; + +unlock: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(shmem_add_seals); + +int shmem_get_seals(struct file *file) +{ + if (file->f_op != &shmem_file_operations) + return -EINVAL; + + return SHMEM_I(file_inode(file))->seals; +} +EXPORT_SYMBOL_GPL(shmem_get_seals); + +long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = shmem_add_seals(file, arg); + break; + case F_GET_SEALS: + error = shmem_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end; int error; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + error = -EPERM; + goto out; + } + + shmem_falloc.waitq = &shmem_falloc_waitq; + shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + spin_unlock(&inode->i_lock); error = 0; goto out; } @@ -1832,6 +2102,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + start = offset >> PAGE_CACHE_SHIFT; end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ @@ -1840,6 +2115,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } + shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; @@ -2047,24 +2323,54 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) return shmem_unlink(dir, dentry); } +static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +{ + bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode); + + if (old_dir != new_dir && old_is_dir != new_is_dir) { + if (old_is_dir) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } else { + drop_nlink(new_dir); + inc_nlink(old_dir); + } + } + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + old_dentry->d_inode->i_ctime = + new_dentry->d_inode->i_ctime = CURRENT_TIME; + + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ -static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode); + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); - if (they_are_dirs) + if (they_are_dirs) { + drop_nlink(new_dentry->d_inode); drop_nlink(old_dir); + } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); @@ -2566,6 +2872,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) shmem_show_mpol(seq, sbinfo->mpol); return 0; } + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct shmem_inode_info *info; + struct file *file; + int fd, error; + char *name; + long len; + + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + info = SHMEM_I(file_inode(file)); + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + if (flags & MFD_ALLOW_SEALING) + info->seals &= ~F_SEAL_SEAL; + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) @@ -2708,13 +3085,13 @@ static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = shmem_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = shmem_file_read_iter, + .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif }; @@ -2740,7 +3117,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, - .rename = shmem_rename, + .rename2 = shmem_rename2, .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR @@ -2783,6 +3160,7 @@ static const struct super_operations shmem_ops = { static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, + .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, @@ -2930,16 +3308,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, this.len = strlen(name); this.hash = 0; /* will go */ sb = shm_mnt->mnt_sb; + path.mnt = mntget(shm_mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt); res = ERR_PTR(-ENOSPC); inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto put_dentry; + goto put_memory; inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); @@ -2947,19 +3325,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) - goto put_dentry; + goto put_path; res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; + goto put_path; return res; -put_dentry: - path_put(&path); put_memory: shmem_unacct_size(flags, size); +put_path: + path_put(&path); return res; } diff --git a/mm/slab.c b/mm/slab.c index b264214c77e..7c52b3890d2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -157,6 +157,17 @@ #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif +#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ + <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) + +#if FREELIST_BYTE_INDEX +typedef unsigned char freelist_idx_t; +#else +typedef unsigned short freelist_idx_t; +#endif + +#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) + /* * true if a page was allocated from pfmemalloc reserves for network-based * swap @@ -180,7 +191,6 @@ struct array_cache { unsigned int limit; unsigned int batchcount; unsigned int touched; - spinlock_t lock; void *entry[]; /* * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing @@ -192,6 +202,11 @@ struct array_cache { */ }; +struct alien_cache { + spinlock_t lock; + struct array_cache ac; +}; + #define SLAB_OBJ_PFMEMALLOC 1 static inline bool is_obj_pfmemalloc(void *objp) { @@ -231,7 +246,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; static int drain_freelist(struct kmem_cache *cache, struct kmem_cache_node *n, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, struct list_head *list); +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); @@ -256,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->node[nodeid]->slab), listp); \ + list_splice(&get_node(cachep, nodeid)->slab, listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -277,8 +293,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. */ -#define REAPTIMEOUT_CPUC (2*HZ) -#define REAPTIMEOUT_LIST3 (4*HZ) +#define REAPTIMEOUT_AC (2*HZ) +#define REAPTIMEOUT_NODE (4*HZ) #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) @@ -375,6 +391,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#define OBJECT_FREE (0) +#define OBJECT_ACTIVE (1) + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void set_obj_status(struct page *page, int idx, int val) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + status[idx] = val; +} + +static inline unsigned int get_obj_status(struct page *page, int idx) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + + return status[idx]; +} + +#else +static inline void set_obj_status(struct page *page, int idx, int val) {} + +#endif + /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. @@ -423,151 +472,57 @@ static struct kmem_cache kmem_cache_boot = { #define BAD_ALIEN_MAGIC 0x01020304ul -#ifdef CONFIG_LOCKDEP - -/* - * Slab sometimes uses the kmalloc slabs to store the slab headers - * for other slabs "off slab". - * The locking for this is tricky in that it nests within the locks - * of all other slabs in a few places; to deal with this special - * locking we put on-slab caches into a separate lock-class. - * - * We set lock class for alien array caches which are up during init. - * The lock annotation will be lost if all cpus of a node goes down and - * then comes back up during hotplug - */ -static struct lock_class_key on_slab_l3_key; -static struct lock_class_key on_slab_alc_key; - -static struct lock_class_key debugobj_l3_key; -static struct lock_class_key debugobj_alc_key; - -static void slab_set_lock_classes(struct kmem_cache *cachep, - struct lock_class_key *l3_key, struct lock_class_key *alc_key, - int q) -{ - struct array_cache **alc; - struct kmem_cache_node *n; - int r; - - n = cachep->node[q]; - if (!n) - return; - - lockdep_set_class(&n->list_lock, l3_key); - alc = n->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, alc_key); - } -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) -{ - slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ - int node; - - for_each_online_node(node) - slab_set_debugobj_lock_classes_node(cachep, node); -} - -static void init_node_lock_keys(int q) -{ - int i; - - if (slab_state < UP) - return; - - for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache_node *n; - struct kmem_cache *cache = kmalloc_caches[i]; - - if (!cache) - continue; - - n = cache->node[q]; - if (!n || OFF_SLAB(cache)) - continue; - - slab_set_lock_classes(cache, &on_slab_l3_key, - &on_slab_alc_key, q); - } -} - -static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) -{ - if (!cachep->node[q]) - return; - - slab_set_lock_classes(cachep, &on_slab_l3_key, - &on_slab_alc_key, q); -} +static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); -static inline void on_slab_lock_classes(struct kmem_cache *cachep) +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { - int node; - - VM_BUG_ON(OFF_SLAB(cachep)); - for_each_node(node) - on_slab_lock_classes_node(cachep, node); + return cachep->array[smp_processor_id()]; } -static inline void init_lock_keys(void) +static size_t calculate_freelist_size(int nr_objs, size_t align) { - int node; - - for_each_node(node) - init_node_lock_keys(node); -} -#else -static void init_node_lock_keys(int q) -{ -} + size_t freelist_size; -static inline void init_lock_keys(void) -{ -} + freelist_size = nr_objs * sizeof(freelist_idx_t); + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size += nr_objs * sizeof(char); -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ -} + if (align) + freelist_size = ALIGN(freelist_size, align); -static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) -{ + return freelist_size; } -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +static int calculate_nr_objs(size_t slab_size, size_t buffer_size, + size_t idx_size, size_t align) { -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ -} -#endif + int nr_objs; + size_t remained_size; + size_t freelist_size; + int extra_space = 0; -static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + extra_space = sizeof(char); + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = slab_size / (buffer_size + idx_size + extra_space); -static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) -{ - return cachep->array[smp_processor_id()]; -} + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + remained_size = slab_size - nr_objs * buffer_size; + freelist_size = calculate_freelist_size(nr_objs, align); + if (remained_size < freelist_size) + nr_objs--; -static size_t slab_mgmt_size(size_t nr_objs, size_t align) -{ - return ALIGN(nr_objs * sizeof(unsigned int), align); + return nr_objs; } /* @@ -600,25 +555,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, nr_objs = slab_size / buffer_size; } else { - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) - nr_objs--; - - mgmt_size = slab_mgmt_size(nr_objs, align); + nr_objs = calculate_nr_objs(slab_size, buffer_size, + sizeof(freelist_idx_t), align); + mgmt_size = calculate_freelist_size(nr_objs, align); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -723,13 +662,8 @@ static void start_cpu_timer(int cpu) } } -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) +static void init_arraycache(struct array_cache *ac, int limit, int batch) { - int memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *nc = NULL; - - nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. * However, when such objects are allocated or transferred to another @@ -737,15 +671,24 @@ static struct array_cache *alloc_arraycache(int node, int entries, * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. */ - kmemleak_no_scan(nc); - if (nc) { - nc->avail = 0; - nc->limit = entries; - nc->batchcount = batchcount; - nc->touched = 0; - spin_lock_init(&nc->lock); + kmemleak_no_scan(ac); + if (ac) { + ac->avail = 0; + ac->limit = limit; + ac->batchcount = batch; + ac->touched = 0; } - return nc; +} + +static struct array_cache *alloc_arraycache(int node, int entries, + int batchcount, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); + struct array_cache *ac = NULL; + + ac = kmalloc_node(memsize, gfp, node); + init_arraycache(ac, entries, batchcount); + return ac; } static inline bool is_slab_pfmemalloc(struct page *page) @@ -757,7 +700,7 @@ static inline bool is_slab_pfmemalloc(struct page *page) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_cache_node *n = cachep->node[numa_mem_id()]; + struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); struct page *page; unsigned long flags; @@ -812,7 +755,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, * If there are empty slabs on the slabs_free list and we are * being forced to refill the cache, mark this one !pfmemalloc. */ - n = cachep->node[numa_mem_id()]; + n = get_node(cachep, numa_mem_id()); if (!list_empty(&n->slabs_free) && force_refill) { struct page *page = virt_to_head_page(objp); ClearPageSlabPfmemalloc(page); @@ -892,12 +835,13 @@ static int transfer_objects(struct array_cache *to, #define drain_alien_cache(cachep, alien) do { } while (0) #define reap_alien(cachep, n) do { } while (0) -static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static inline struct alien_cache **alloc_alien_cache(int node, + int limit, gfp_t gfp) { - return (struct array_cache **)BAD_ALIEN_MAGIC; + return (struct alien_cache **)BAD_ALIEN_MAGIC; } -static inline void free_alien_cache(struct array_cache **ac_ptr) +static inline void free_alien_cache(struct alien_cache **ac_ptr) { } @@ -923,46 +867,60 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); -static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static struct alien_cache *__alloc_alien_cache(int node, int entries, + int batch, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + struct alien_cache *alc = NULL; + + alc = kmalloc_node(memsize, gfp, node); + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + return alc; +} + +static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - struct array_cache **ac_ptr; - int memsize = sizeof(void *) * nr_node_ids; + struct alien_cache **alc_ptr; + size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - ac_ptr = kzalloc_node(memsize, gfp, node); - if (ac_ptr) { - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); - if (!ac_ptr[i]) { - for (i--; i >= 0; i--) - kfree(ac_ptr[i]); - kfree(ac_ptr); - return NULL; - } + alc_ptr = kzalloc_node(memsize, gfp, node); + if (!alc_ptr) + return NULL; + + for_each_node(i) { + if (i == node || !node_online(i)) + continue; + alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); + if (!alc_ptr[i]) { + for (i--; i >= 0; i--) + kfree(alc_ptr[i]); + kfree(alc_ptr); + return NULL; } } - return ac_ptr; + return alc_ptr; } -static void free_alien_cache(struct array_cache **ac_ptr) +static void free_alien_cache(struct alien_cache **alc_ptr) { int i; - if (!ac_ptr) + if (!alc_ptr) return; for_each_node(i) - kfree(ac_ptr[i]); - kfree(ac_ptr); + kfree(alc_ptr[i]); + kfree(alc_ptr); } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + struct list_head *list) { - struct kmem_cache_node *n = cachep->node[node]; + struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { spin_lock(&n->list_lock); @@ -974,7 +932,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (n->shared) transfer_objects(n->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; spin_unlock(&n->list_lock); } @@ -988,28 +946,40 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) int node = __this_cpu_read(slab_reap_node); if (n->alien) { - struct array_cache *ac = n->alien[node]; + struct alien_cache *alc = n->alien[node]; + struct array_cache *ac; + + if (alc) { + ac = &alc->ac; + if (ac->avail && spin_trylock_irq(&alc->lock)) { + LIST_HEAD(list); - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + __drain_alien_cache(cachep, ac, node, &list); + spin_unlock_irq(&alc->lock); + slabs_destroy(cachep, &list); + } } } } static void drain_alien_cache(struct kmem_cache *cachep, - struct array_cache **alien) + struct alien_cache **alien) { int i = 0; + struct alien_cache *alc; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = alien[i]; - if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + alc = alien[i]; + if (alc) { + LIST_HEAD(list); + + ac = &alc->ac; + spin_lock_irqsave(&alc->lock, flags); + __drain_alien_cache(cachep, ac, i, &list); + spin_unlock_irqrestore(&alc->lock, flags); + slabs_destroy(cachep, &list); } } } @@ -1018,8 +988,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; - struct array_cache *alien = NULL; + struct alien_cache *alien = NULL; + struct array_cache *ac; int node; + LIST_HEAD(list); node = numa_mem_id(); @@ -1030,21 +1002,25 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (likely(nodeid == node)) return 0; - n = cachep->node[node]; + n = get_node(cachep, node); STATS_INC_NODEFREES(cachep); if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; + ac = &alien->ac; spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) { + if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, ac, nodeid, &list); } - ac_put_obj(cachep, alien, objp); + ac_put_obj(cachep, ac, objp); spin_unlock(&alien->lock); + slabs_destroy(cachep, &list); } else { - spin_lock(&(cachep->node[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->node[nodeid])->list_lock); + n = get_node(cachep, nodeid); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, nodeid, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); } return 1; } @@ -1063,35 +1039,36 @@ static int init_cache_node_node(int node) { struct kmem_cache *cachep; struct kmem_cache_node *n; - const int memsize = sizeof(struct kmem_cache_node); + const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { /* - * Set up the size64 kmemlist for cpu before we can + * Set up the kmem_cache_node for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->node[node]) { + n = get_node(cachep, node); + if (!n) { n = kmalloc_node(memsize, GFP_KERNEL, node); if (!n) return -ENOMEM; kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; /* - * The l3s don't come and go as CPUs come and - * go. slab_mutex is sufficient + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient * protection here. */ cachep->node[node] = n; } - spin_lock_irq(&cachep->node[node]->list_lock); - cachep->node[node]->free_limit = + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->node[node]->list_lock); + spin_unlock_irq(&n->list_lock); } return 0; } @@ -1112,12 +1089,13 @@ static void cpuup_canceled(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared; - struct array_cache **alien; + struct alien_cache **alien; + LIST_HEAD(list); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) goto free_array_cache; @@ -1127,7 +1105,7 @@ static void cpuup_canceled(long cpu) /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, &list); if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); @@ -1137,7 +1115,7 @@ static void cpuup_canceled(long cpu) shared = n->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = NULL; } @@ -1152,6 +1130,7 @@ static void cpuup_canceled(long cpu) free_alien_cache(alien); } free_array_cache: + slabs_destroy(cachep, &list); kfree(nc); } /* @@ -1160,7 +1139,7 @@ free_array_cache: * shrink each nodelist to its limit. */ list_for_each_entry(cachep, &slab_caches, list) { - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; drain_freelist(cachep, n, slabs_tofree(cachep, n)); @@ -1191,7 +1170,7 @@ static int cpuup_prepare(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared = NULL; - struct array_cache **alien = NULL; + struct alien_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount, GFP_KERNEL); @@ -1215,7 +1194,7 @@ static int cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(!n); spin_lock_irq(&n->list_lock); @@ -1236,13 +1215,7 @@ static int cpuup_prepare(long cpu) spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(alien); - if (cachep->flags & SLAB_DEBUG_OBJECTS) - slab_set_debugobj_lock_classes_node(cachep, node); - else if (!OFF_SLAB(cachep) && - !(cachep->flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes_node(cachep, node); } - init_node_lock_keys(node); return 0; bad: @@ -1326,7 +1299,7 @@ static int __meminit drain_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { struct kmem_cache_node *n; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; @@ -1406,8 +1379,8 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) for_each_online_node(node) { cachep->node[node] = &init_kmem_cache_node[index + node]; cachep->node[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; } } @@ -1506,10 +1479,6 @@ void __init kmem_cache_init(void) memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmem_cache->array[smp_processor_id()] = ptr; @@ -1519,10 +1488,6 @@ void __init kmem_cache_init(void) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } @@ -1559,9 +1524,6 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&slab_mutex); - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* Done! */ slab_state = FULL; @@ -1604,10 +1566,16 @@ __initcall(cpucache_init); static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { +#if DEBUG struct kmem_cache_node *n; struct page *page; unsigned long flags; int node; + static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) + return; printk(KERN_WARNING "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", @@ -1615,14 +1583,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); - for_each_online_node(node) { + for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - n = cachep->node[node]; - if (!n) - continue; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; @@ -1645,10 +1609,12 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } +#endif } /* - * Interface to system's page allocator. No need to hold the cache-lock. + * Interface to system's page allocator. No need to hold the + * kmem_cache_node ->list_lock. * * If we requested dmaable memory, we will get it. Even if we * did not request dmaable memory, we might get it, but that @@ -1664,10 +1630,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; + if (memcg_charge_slab(cachep, flags, cachep->gfporder)) + return NULL; + page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - if (!(flags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(cachep, flags, nodeid); + memcg_uncharge_slab(cachep, cachep->gfporder); + slab_out_of_memory(cachep, flags, nodeid); return NULL; } @@ -1685,7 +1654,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); - memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1721,10 +1689,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) page_mapcount_reset(page); page->mapping = NULL; - memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_memcg_kmem_pages(page, cachep->gfporder); + __free_pages(page, cachep->gfporder); + memcg_uncharge_slab(cachep, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -1948,9 +1916,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, * @cachep: cache pointer being destroyed * @page: page pointer being destroyed * - * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. The - * cache-lock is not held/needed. + * Destroy all the objs in a slab page, and release the mem back to the system. + * Before calling the slab page must have been unlinked from the cache. The + * kmem_cache_node ->list_lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct page *page) { @@ -1982,6 +1950,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) kmem_cache_free(cachep->freelist_cache, freelist); } +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) +{ + struct page *page, *n; + + list_for_each_entry_safe(page, n, list, lru) { + list_del(&page->lru); + slab_destroy(cachep, page); + } +} + /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2010,14 +1988,21 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (!num) continue; + /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ + if (num > SLAB_OBJ_MAX_NUM) + break; + if (flags & CFLGS_OFF_SLAB) { + size_t freelist_size_per_obj = sizeof(freelist_idx_t); /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size_per_obj += sizeof(char); offslab_limit = size; - offslab_limit /= sizeof(unsigned int); + offslab_limit /= freelist_size_per_obj; if (num > offslab_limit) break; @@ -2103,8 +2088,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } cachep->node[numa_mem_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; cpu_cache_get(cachep)->avail = 0; cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; @@ -2139,7 +2124,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size, ralign; + size_t left_over, freelist_size; + size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; size_t size = cachep->size; @@ -2172,14 +2158,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(BYTES_PER_WORD - 1); } - /* - * Redzoning and user store require word alignment or possibly larger. - * Note this will be overridden by architecture or caller mandated - * alignment if either is greater than BYTES_PER_WORD. - */ - if (flags & SLAB_STORE_USER) - ralign = BYTES_PER_WORD; - if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably @@ -2243,7 +2221,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2252,14 +2230,19 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) flags |= CFLGS_OFF_SLAB; size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); left_over = calculate_slab_order(cachep, size, cachep->align, flags); if (!cachep->num) return -E2BIG; - freelist_size = - ALIGN(cachep->num * sizeof(unsigned int), cachep->align); + freelist_size = calculate_freelist_size(cachep->num, cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2272,7 +2255,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - freelist_size = cachep->num * sizeof(unsigned int); + freelist_size = calculate_freelist_size(cachep->num, 0); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2300,10 +2283,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* - * This is a possibility for one of the malloc_sizes caches. + * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, - * this should not happen at all. + * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); @@ -2315,17 +2298,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) return err; } - if (flags & SLAB_DEBUG_OBJECTS) { - /* - * Would deadlock through slab_destroy()->call_rcu()-> - * debug_object_activate()->kmem_cache_alloc(). - */ - WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); - - slab_set_debugobj_lock_classes(cachep); - } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes(cachep); - return 0; } @@ -2344,7 +2316,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); + assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2352,7 +2324,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[node]->list_lock); + assert_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2372,12 +2344,16 @@ static void do_drain(void *arg) struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_mem_id(); + struct kmem_cache_node *n; + LIST_HEAD(list); check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->node[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->node[node]->list_lock); + n = get_node(cachep, node); + spin_lock(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail = 0; } @@ -2388,17 +2364,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) on_each_cpu(do_drain, cachep, 1); check_irq_on(); - for_each_online_node(node) { - n = cachep->node[node]; - if (n && n->alien) + for_each_kmem_cache_node(cachep, node, n) + if (n->alien) drain_alien_cache(cachep, n->alien); - } - for_each_online_node(node) { - n = cachep->node[node]; - if (n) - drain_array(cachep, n, n->shared, 1, node); - } + for_each_kmem_cache_node(cachep, node, n) + drain_array(cachep, n, n->shared, 1, node); } /* @@ -2442,20 +2413,16 @@ out: return nr_freed; } -/* Called with slab_mutex held to protect against cpu hotplug */ -static int __cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep) { - int ret = 0, i = 0; + int ret = 0; + int node; struct kmem_cache_node *n; drain_cpu_caches(cachep); check_irq_on(); - for_each_online_node(i) { - n = cachep->node[i]; - if (!n) - continue; - + for_each_kmem_cache_node(cachep, node, n) { drain_freelist(cachep, n, slabs_tofree(cachep, n)); ret += !list_empty(&n->slabs_full) || @@ -2464,32 +2431,11 @@ static int __cache_shrink(struct kmem_cache *cachep) return (ret ? 1 : 0); } -/** - * kmem_cache_shrink - Shrink a cache. - * @cachep: The cache to shrink. - * - * Releases as many slabs as possible for a cache. - * To help debugging, a zero exit status indicates all slabs were released. - */ -int kmem_cache_shrink(struct kmem_cache *cachep) -{ - int ret; - BUG_ON(!cachep || in_interrupt()); - - get_online_cpus(); - mutex_lock(&slab_mutex); - ret = __cache_shrink(cachep); - mutex_unlock(&slab_mutex); - put_online_cpus(); - return ret; -} -EXPORT_SYMBOL(kmem_cache_shrink); - int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; struct kmem_cache_node *n; - int rc = __cache_shrink(cachep); + int rc = __kmem_cache_shrink(cachep); if (rc) return rc; @@ -2498,27 +2444,28 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) kfree(cachep->array[i]); /* NUMA: free the node structures */ - for_each_online_node(i) { - n = cachep->node[i]; - if (n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - } + for_each_kmem_cache_node(cachep, i, n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[i] = NULL; } return 0; } /* * Get the memory for a slab management obj. - * For a slab cache when the slab descriptor is off-slab, slab descriptors - * always come from malloc_sizes caches. The slab descriptor cannot - * come from the same cache which is getting created because, - * when we are searching for an appropriate cache for these - * descriptors in kmem_cache_create, we search through the malloc_sizes array. - * If we are creating a malloc_sizes cache here it would not be visible to - * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have freelist_cache same as the original cache. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * Because if it is the case, that means we defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init(). */ static void *alloc_slabmgmt(struct kmem_cache *cachep, struct page *page, int colour_off, @@ -2542,9 +2489,15 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, return freelist; } -static inline unsigned int *slab_freelist(struct page *page) +static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) +{ + return ((freelist_idx_t *)page->freelist)[idx]; +} + +static inline void set_free_obj(struct page *page, + unsigned int idx, freelist_idx_t val) { - return (unsigned int *)(page->freelist); + ((freelist_idx_t *)(page->freelist))[idx] = val; } static void cache_init_objs(struct kmem_cache *cachep, @@ -2589,7 +2542,8 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_freelist(page)[i] = i; + set_obj_status(page, i, OBJECT_FREE); + set_free_obj(page, i, i); } } @@ -2608,7 +2562,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, { void *objp; - objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); + objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; #if DEBUG WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); @@ -2629,7 +2583,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { - if (slab_freelist(page)[i] == objnr) { + if (get_free_obj(page, i) == objnr) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); @@ -2637,7 +2591,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, } #endif page->active--; - slab_freelist(page)[page->active] = objnr; + set_free_obj(page, page->active, objnr); } /* @@ -2673,7 +2627,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); spin_lock(&n->list_lock); /* Get colour for the slab, and cal the next value. */ @@ -2797,6 +2751,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); + set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2841,7 +2796,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); spin_lock(&n->list_lock); @@ -2886,9 +2841,9 @@ retry: /* move slabp to correct slabp list: */ list_del(&page->lru); if (page->active == cachep->num) - list_add(&page->list, &n->slabs_full); + list_add(&page->lru, &n->slabs_full); else - list_add(&page->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); } must_grow: @@ -2930,6 +2885,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + struct page *page; + if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -2960,6 +2917,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } + + page = virt_to_head_page(objp); + set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -2976,7 +2936,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == kmem_cache) + if (unlikely(cachep == kmem_cache)) return false; return should_failslab(cachep->object_size, flags, cachep->flags); @@ -3027,7 +2987,7 @@ out: #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. @@ -3042,7 +3002,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) - nid_alloc = slab_node(); + nid_alloc = mempolicy_slab_node(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3073,8 +3033,8 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(), flags); + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); retry: /* @@ -3085,8 +3045,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->node[nid] && - cache->node[nid]->free_objects) { + get_node(cache, nid) && + get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3131,7 +3091,7 @@ retry: } } - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return obj; } @@ -3149,7 +3109,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int x; VM_BUG_ON(nodeid > num_online_nodes()); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); BUG_ON(!n); retry: @@ -3220,7 +3180,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->node[nodeid])) { + if (unlikely(!get_node(cachep, nodeid))) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3245,11 +3205,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); - if (likely(ptr)) + if (likely(ptr)) { kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && ptr)) - memset(ptr, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(ptr, 0, cachep->object_size); + } return ptr; } @@ -3259,7 +3219,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; @@ -3310,23 +3270,24 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) flags); prefetchw(objp); - if (likely(objp)) + if (likely(objp)) { kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && objp)) - memset(objp, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(objp, 0, cachep->object_size); + } return objp; } /* - * Caller needs to acquire correct kmem_list's list_lock + * Caller needs to acquire correct kmem_cache_node's list_lock + * @list: List of detached free slabs should be freed by caller */ -static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) +static void free_block(struct kmem_cache *cachep, void **objpp, + int nr_objects, int node, struct list_head *list) { int i; - struct kmem_cache_node *n; + struct kmem_cache_node *n = get_node(cachep, node); for (i = 0; i < nr_objects; i++) { void *objp; @@ -3336,7 +3297,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; page = virt_to_head_page(objp); - n = cachep->node[node]; list_del(&page->lru); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp, node); @@ -3347,13 +3307,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; - /* No need to drop any previously held - * lock here, even if we have a off-slab slab - * descriptor it is guaranteed to come from - * a different cache, refer to comments before - * alloc_slabmgmt. - */ - slab_destroy(cachep, page); + list_add_tail(&page->lru, list); } else { list_add(&page->lru, &n->slabs_free); } @@ -3372,13 +3326,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) int batchcount; struct kmem_cache_node *n; int node = numa_mem_id(); + LIST_HEAD(list); batchcount = ac->batchcount; #if DEBUG BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - n = cachep->node[node]; + n = get_node(cachep, node); spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; @@ -3393,7 +3348,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, &list); free_done: #if STATS { @@ -3414,6 +3369,7 @@ free_done: } #endif spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } @@ -3574,11 +3530,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; - /* If you want to save a few bytes .text space: replace - * __ with kmem_. - * Then kmalloc uses the uninlined functions instead of the inline - * functions. - */ cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; @@ -3670,12 +3621,12 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) +static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) { int node; struct kmem_cache_node *n; struct array_cache *new_shared; - struct array_cache **new_alien = NULL; + struct alien_cache **new_alien = NULL; for_each_online_node(node) { @@ -3696,15 +3647,16 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } } - n = cachep->node[node]; + n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; + LIST_HEAD(list); spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = new_shared; if (!n->alien) { @@ -3714,6 +3666,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(shared); free_alien_cache(new_alien); continue; @@ -3726,8 +3679,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; n->shared = new_shared; n->alien = new_alien; n->free_limit = (1 + nr_cpus_node(node)) * @@ -3741,9 +3694,8 @@ fail: /* Cache is not active yet. Roll back what we did */ node--; while (node >= 0) { - if (cachep->node[node]) { - n = cachep->node[node]; - + n = get_node(cachep, node); + if (n) { kfree(n->shared); free_alien_cache(n->alien); kfree(n); @@ -3804,16 +3756,24 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; for_each_online_cpu(i) { + LIST_HEAD(list); struct array_cache *ccold = new->new[i]; + int node; + struct kmem_cache_node *n; + if (!ccold) continue; - spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); + + node = cpu_to_mem(i); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ccold->entry, ccold->avail, node, &list); + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(ccold); } kfree(new); - return alloc_kmemlist(cachep, gfp); + return alloc_kmem_cache_node(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -3917,6 +3877,7 @@ skip_setup: static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { + LIST_HEAD(list); int tofree; if (!ac || !ac->avail) @@ -3929,12 +3890,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &list); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); } } @@ -3969,7 +3931,7 @@ static void cache_reap(struct work_struct *w) * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - n = searchp->node[node]; + n = get_node(searchp, node); reap_alien(searchp, n); @@ -3982,7 +3944,7 @@ static void cache_reap(struct work_struct *w) if (time_after(n->next_reap, jiffies)) goto next; - n->next_reap = jiffies + REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE; drain_array(searchp, n, n->shared, 0, node); @@ -4003,7 +3965,7 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } #ifdef CONFIG_SLABINFO @@ -4021,10 +3983,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs = 0; num_slabs = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -4201,21 +4160,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i, j; + int i; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - bool active = true; - - for (j = page->active; j < c->num; j++) { - /* Skip freed item */ - if (slab_freelist(page)[j] == i) { - active = false; - break; - } - } - if (!active) + if (get_obj_status(page, i) != OBJECT_ACTIVE) continue; if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) @@ -4258,10 +4208,7 @@ static int leaks_show(struct seq_file *m, void *p) x[1] = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); diff --git a/mm/slab.h b/mm/slab.h index 8184a7cde27..0e0fdd36584 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, struct mem_cgroup; #ifdef CONFIG_SLUB struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)); #else static inline struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif @@ -91,6 +91,8 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; struct file; @@ -119,28 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return !s->memcg_params || s->memcg_params->is_root_cache; } -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) -{ - return (is_root_cache(cachep) && !memcg) || - (cachep->memcg_params->memcg == memcg); -} - -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_add(1 << order, &s->memcg_params->nr_pages); -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ - if (is_root_cache(s)) - return; - - if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) - mem_cgroup_destroy_cache(s); -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { @@ -198,24 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; return s->memcg_params->root_cache; } -#else -static inline bool is_root_cache(struct kmem_cache *s) -{ - return true; -} -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) +static __always_inline int memcg_charge_slab(struct kmem_cache *s, + gfp_t gfp, int order) { - return true; + if (!memcg_kmem_enabled()) + return 0; + if (is_root_cache(s)) + return 0; + return __memcg_charge_slab(s, gfp, order); } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) +static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) { + if (!memcg_kmem_enabled()) + return; + if (is_root_cache(s)) + return; + __memcg_uncharge_slab(s, order); } - -static inline void memcg_release_pages(struct kmem_cache *s, int order) +#else +static inline bool is_root_cache(struct kmem_cache *s) { + return true; } static inline bool slab_equal_or_root(struct kmem_cache *s, @@ -239,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { return s; } + +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) @@ -262,13 +256,12 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __FUNCTION__, cachep->name, s->name); + __func__, cachep->name, s->name); WARN_ON_ONCE(1); return s; } -#endif - +#ifndef CONFIG_SLOB /* * The slab lists for all objects. */ @@ -283,7 +276,7 @@ struct kmem_cache_node { unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct alien_cache **alien; /* on other nodes */ unsigned long next_reap; /* updated without locking */ int free_touched; /* updated without locking */ #endif @@ -300,5 +293,22 @@ struct kmem_cache_node { }; +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ + if (__n) + +#endif + void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); + +#endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 1ec3c619ba0..d319502b240 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -19,6 +19,8 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> + +#define CREATE_TRACE_POINTS #include <trace/events/kmem.h> #include "slab.h" @@ -29,8 +31,7 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, - size_t size) +static int kmem_cache_sanity_check(const char *name, size_t size) { struct kmem_cache *s = NULL; @@ -56,14 +57,8 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, continue; } -#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) - /* - * For simplicity, we won't check this in the list of memcg - * caches. We have control over memcg naming, and if there - * aren't duplicates in the global list, there won't be any - * duplicates in the memcg lists as well. - */ - if (!memcg && !strcmp(s->name, name)) { +#if !defined(CONFIG_SLUB) + if (!strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -77,8 +72,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, return 0; } #else -static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, - const char *name, size_t size) +static inline int kmem_cache_sanity_check(const char *name, size_t size) { return 0; } @@ -139,6 +133,45 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } +static struct kmem_cache * +do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + int err; + + err = -ENOMEM; + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (!s) + goto out; + + s->name = name; + s->object_size = object_size; + s->size = size; + s->align = align; + s->ctor = ctor; + + err = memcg_alloc_cache_params(memcg, s, root_cache); + if (err) + goto out_free_cache; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); +out: + if (err) + return ERR_PTR(err); + return s; + +out_free_cache: + memcg_free_cache_params(s); + kfree(s); + goto out; +} /* * kmem_cache_create - Create a cache. @@ -164,34 +197,23 @@ unsigned long calculate_alignment(unsigned long flags, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ - struct kmem_cache * -kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct kmem_cache *parent_cache) +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s = NULL; + struct kmem_cache *s; + char *cache_name; int err; get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); - err = kmem_cache_sanity_check(memcg, name, size); + err = kmem_cache_sanity_check(name, size); if (err) goto out_unlock; - if (memcg) { - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can - * try to create the same cache, but only one of them may - * succeed. Therefore if we get here and see the cache has - * already been created, we silently return NULL. - */ - if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) - goto out_unlock; - } - /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this @@ -200,50 +222,31 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, */ flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); + s = __kmem_cache_alias(name, size, align, flags, ctor); if (s) goto out_unlock; - err = -ENOMEM; - s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); - if (!s) + cache_name = kstrdup(name, GFP_KERNEL); + if (!cache_name) { + err = -ENOMEM; goto out_unlock; + } - s->object_size = s->size = size; - s->align = calculate_alignment(flags, align, size); - s->ctor = ctor; - - s->name = kstrdup(name, GFP_KERNEL); - if (!s->name) - goto out_free_cache; - - err = memcg_alloc_cache_params(memcg, s, parent_cache); - if (err) - goto out_free_cache; - - err = __kmem_cache_create(s, flags); - if (err) - goto out_free_cache; - - s->refcount = 1; - list_add(&s->list, &slab_caches); - memcg_register_cache(s); + s = do_kmem_cache_create(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); + if (IS_ERR(s)) { + err = PTR_ERR(s); + kfree(cache_name); + } out_unlock: mutex_unlock(&slab_mutex); + + put_online_mems(); put_online_cpus(); if (err) { - /* - * There is no point in flooding logs with warnings or - * especially crashing the system if we fail to create a cache - * for a memcg. In this case we will be accounting the memcg - * allocation to the root cgroup until we succeed to create its - * own cache, but it isn't that critical. - */ - if (!memcg) - return NULL; - if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); @@ -255,56 +258,145 @@ out_unlock: return NULL; } return s; +} +EXPORT_SYMBOL(kmem_cache_create); -out_free_cache: - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - goto out_unlock; +#ifdef CONFIG_MEMCG_KMEM +/* + * memcg_create_kmem_cache - Create a cache for a memory cgroup. + * @memcg: The memory cgroup the new cache is for. + * @root_cache: The parent of the new cache. + * @memcg_name: The name of the memory cgroup (used for naming the new cache). + * + * This function attempts to create a kmem cache that will serve allocation + * requests going from @memcg to @root_cache. The new cache inherits properties + * from its parent. + */ +struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache, + const char *memcg_name) +{ + struct kmem_cache *s = NULL; + char *cache_name; + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), memcg_name); + if (!cache_name) + goto out_unlock; + + s = do_kmem_cache_create(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); + if (IS_ERR(s)) { + kfree(cache_name); + s = NULL; + } + +out_unlock: + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + return s; } -struct kmem_cache * -kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { - return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); + int rc; + + if (!s->memcg_params || + !s->memcg_params->is_root_cache) + return 0; + + mutex_unlock(&slab_mutex); + rc = __memcg_cleanup_cache_params(s); + mutex_lock(&slab_mutex); + + return rc; } -EXPORT_SYMBOL(kmem_cache_create); +#else +static int memcg_cleanup_cache_params(struct kmem_cache *s) +{ + return 0; +} +#endif /* CONFIG_MEMCG_KMEM */ -void kmem_cache_destroy(struct kmem_cache *s) +void slab_kmem_cache_release(struct kmem_cache *s) { - /* Destroy all the children caches if we aren't a memcg cache */ - kmem_cache_destroy_memcg_children(s); + kfree(s->name); + kmem_cache_free(kmem_cache, s); +} +void kmem_cache_destroy(struct kmem_cache *s) +{ get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); + s->refcount--; - if (!s->refcount) { - list_del(&s->list); - - if (!__kmem_cache_shutdown(s)) { - memcg_unregister_cache(s); - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - } else { - list_add(&s->list, &slab_caches); - mutex_unlock(&slab_mutex); - printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", - s->name); - dump_stack(); - } - } else { - mutex_unlock(&slab_mutex); + if (s->refcount) + goto out_unlock; + + if (memcg_cleanup_cache_params(s) != 0) + goto out_unlock; + + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + goto out_unlock; } + + list_del(&s->list); + + mutex_unlock(&slab_mutex); + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + memcg_free_cache_params(s); +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif + goto out; + +out_unlock: + mutex_unlock(&slab_mutex); +out: + put_online_mems(); put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. + */ +int kmem_cache_shrink(struct kmem_cache *cachep) +{ + int ret; + + get_online_cpus(); + get_online_mems(); + ret = __kmem_cache_shrink(cachep); + put_online_mems(); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL(kmem_cache_shrink); + int slab_is_available(void) { return slab_state >= UP; @@ -519,6 +611,24 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { @@ -679,3 +789,102 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ + +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + +/** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. + */ +void *__krealloc(const void *p, size_t new_size, gfp_t flags) +{ + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + return __do_krealloc(p, new_size, flags); + +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + +/* Tracepoints definitions. */ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index 4bf8809dfcc..21980e0f39a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -111,13 +111,13 @@ static inline int slob_page_free(struct page *sp) static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->list, list); + list_add(&sp->lru, list); __SetPageSlobFree(sp); } static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->list); + list_del(&sp->lru); __ClearPageSlobFree(sp); } @@ -282,7 +282,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, list) { + list_for_each_entry(sp, slob_list, lru) { #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -296,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) continue; /* Attempt to alloc */ - prev = sp->list.prev; + prev = sp->lru.prev; b = slob_page_alloc(sp, size, align); if (!b) continue; @@ -322,7 +322,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); sp->freelist = b; - INIT_LIST_HEAD(&sp->list); + INIT_LIST_HEAD(&sp->lru); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align); @@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } -int kmem_cache_shrink(struct kmem_cache *d) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", diff --git a/mm/slub.c b/mm/slub.c index 25f14ad8f81..3e8afcc07a7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -210,21 +210,22 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } - static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. + */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -232,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -287,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -381,9 +381,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -402,7 +402,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -417,9 +417,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; } else #endif { @@ -443,7 +443,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -545,14 +545,14 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; for (i = 0; i < TRACK_ADDRS_COUNT; i++) if (t->addrs[i]) - printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + pr_err("\t%pS\n", (void *)t->addrs[i]); else break; } @@ -570,38 +570,37 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR - "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", page, page->objects, page->inuse, page->freelist, page->flags); } static void slab_bug(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("-----------------------------------------------------------------------------\n\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + va_end(args); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -613,8 +612,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); @@ -697,7 +696,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault[0], value); print_trailer(s, page, object); @@ -930,7 +929,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", object, page->inuse, @@ -945,60 +944,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); -} - -/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache *s, @@ -1133,9 +1078,8 @@ static noinline struct kmem_cache_node *free_debug_processing( slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); } else if (!page->slab_cache) { - printk(KERN_ERR - "SLUB <none>: no slab for object 0x%p.\n", - object); + pr_err("SLUB <none>: no slab for object 0x%p.\n", + object); dump_stack(); } else object_err(s, page, object, @@ -1218,8 +1162,8 @@ static int __init setup_slub_debug(char *str) slub_debug |= SLAB_FAILSLAB; break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + pr_err("slub_debug option '%c' unknown. skipped\n", + *str); } } @@ -1283,6 +1227,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); @@ -1294,36 +1244,68 @@ static inline void kfree_hook(const void *x) } static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) + return should_failslab(s->object_size, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -#endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); +} /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) { + struct page *page; int order = oo_order(oo); flags |= __GFP_NOTRACK; + if (memcg_charge_slab(s, flags, order)) + return NULL; + if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); + page = alloc_pages(flags, order); else - return alloc_pages_exact_node(node, flags, order); + page = alloc_pages_exact_node(node, flags, order); + + if (!page) + memcg_uncharge_slab(s, order); + + return page; } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1345,14 +1327,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; + alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(flags, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); @@ -1362,7 +1345,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); /* * Objects from caches that have a constructor don't get @@ -1400,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; void *start; - void *last; void *p; int order; + int idx; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1413,7 +1396,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1424,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); page->freelist = start; page->inuse = page->objects; @@ -1464,11 +1445,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - memcg_release_pages(s, order); page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_memcg_kmem_pages(page, order); + __free_pages(page, order); + memcg_uncharge_slab(s, order); } #define need_reserve_slab_rcu \ @@ -1684,8 +1665,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(), flags); + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1696,19 +1677,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } @@ -1720,7 +1699,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu *c) { void *object; - int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; + int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -1770,19 +1749,19 @@ static inline void note_cmpxchg_failure(const char *n, #ifdef SLUB_DEBUG_CMPXCHG unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + pr_info("%s %s: cmpxchg redo ", n, s->name); #ifdef CONFIG_PREEMPT if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) - printk("due to cpu change %d -> %d\n", + pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); else #endif if (tid_to_event(tid) != tid_to_event(actual_tid)) - printk("due to cpu running other code. Event %ld->%ld\n", + pr_warn("due to cpu running other code. Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); else - printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); @@ -1875,7 +1854,7 @@ redo: new.frozen = 0; - if (!new.inuse && n->nr_partial > s->min_partial) + if (!new.inuse && n->nr_partial >= s->min_partial) m = M_FREE; else if (new.freelist) { m = M_PARTIAL; @@ -1986,7 +1965,7 @@ static void unfreeze_partials(struct kmem_cache *s, new.freelist, new.counters, "unfreezing slab")); - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { page->next = discard_page; discard_page = page; } else { @@ -2121,11 +2100,19 @@ static inline int node_match(struct page *page, int node) return 1; } +#ifdef CONFIG_SLUB_DEBUG static int count_free(struct page *page) { return page->objects - page->inuse; } +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct page *)) { @@ -2139,49 +2126,43 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } - -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ -#ifdef CONFIG_SLUB_DEBUG - return atomic_long_read(&n->total_objects); -#else - return 0; -#endif -} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n; + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; - printk(KERN_WARNING - "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->object_size, - s->size, oo_order(s->oo), oo_order(s->min)); + pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - printk(KERN_WARNING " %s debugging increased min order, use " - "slub_debug=O to disable.\n", s->name); + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); - printk(KERN_WARNING - " node %d: slabs: %ld, objs: %ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } +#endif } static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, @@ -2198,7 +2179,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, page = new_slab(s, flags, node); if (page) { - c = __this_cpu_ptr(s->cpu_slab); + c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); @@ -2323,8 +2304,6 @@ redo: if (freelist) goto load_freelist; - stat(s, ALLOC_SLOWPATH); - freelist = get_freelist(s, page); if (!freelist) { @@ -2360,9 +2339,7 @@ new_slab: freelist = new_slab_objects(s, gfpflags, node, &c); if (unlikely(!freelist)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - + slab_out_of_memory(s, gfpflags, node); local_irq_restore(flags); return NULL; } @@ -2418,7 +2395,7 @@ redo: * and the retrieval of the tid. */ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); /* * The transaction ids are globally unique per cpu and per operation on @@ -2431,10 +2408,10 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) + if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - - else { + stat(s, ALLOC_SLOWPATH); + } else { void *next_object = get_freepointer_safe(s, object); /* @@ -2613,7 +2590,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* @@ -2674,7 +2651,7 @@ redo: * during the cmpxchg then the free will succedd. */ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); tid = c->tid; preempt_enable(); @@ -2894,10 +2871,8 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!page); if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } n = page->freelist; @@ -2923,13 +2898,10 @@ static void early_kmem_cache_node_alloc(int node) static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@ -3182,8 +3154,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", - p, p - addr); + pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -3218,12 +3189,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@ -3235,23 +3205,7 @@ static inline int kmem_cache_close(struct kmem_cache *s) int __kmem_cache_shutdown(struct kmem_cache *s) { - int rc = kmem_cache_close(s); - - if (!rc) { - /* - * We do the same lock strategy around sysfs_slab_add, see - * __kmem_cache_create. Because this is pretty much the last - * operation we do and the lock will be released shortly after - * that in slab_common.c, we could just move sysfs_slab_remove - * to a later point in common code. We should do that when we - * have a common sysfs framework for all allocators. - */ - mutex_unlock(&slab_mutex); - sysfs_slab_remove(s); - mutex_lock(&slab_mutex); - } - - return rc; + return kmem_cache_close(s); } /******************************************************************** @@ -3321,8 +3275,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3391,7 +3345,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); @@ -3408,7 +3362,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3424,9 +3378,7 @@ int kmem_cache_shrink(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { if (!n->nr_partial) continue; @@ -3464,7 +3416,6 @@ int kmem_cache_shrink(struct kmem_cache *s) kfree(slabs_by_inuse); return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); static int slab_mem_going_offline_callback(void *arg) { @@ -3472,7 +3423,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; @@ -3599,6 +3550,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3608,19 +3560,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@ -3666,9 +3615,7 @@ void __init kmem_cache_init(void) register_cpu_notifier(&slab_notifier); #endif - printk(KERN_INFO - "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -3686,6 +3633,9 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; + if (!is_root_cache(s)) + return 1; + if (s->ctor) return 1; @@ -3698,9 +3648,8 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3723,7 +3672,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, continue; if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; + continue; /* * Check if alignment is compatible. * Courtesy of Adrian Drzewiecki @@ -3734,23 +3683,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, if (s->size - size >= sizeof(void *)) continue; - if (!cache_match_memcg(s, memcg)) - continue; - return s; } return NULL; } struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(memcg, size, align, flags, name, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { + int i; + struct kmem_cache *c; + s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. @@ -3758,6 +3708,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + if (sysfs_slab_alias(s, name)) { s->refcount--; s = NULL; @@ -3780,10 +3739,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) return 0; memcg_propagate_slab_attrs(s); - mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); - mutex_lock(&slab_mutex); - if (err) kmem_cache_close(s); @@ -3941,8 +3897,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -3952,9 +3908,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); @@ -3967,16 +3922,14 @@ static long validate_slab_cache(struct kmem_cache *s) unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@ -4130,6 +4083,7 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@ -4139,8 +4093,7 @@ static int list_locations(struct kmem_cache *s, char *buf, /* Push back cpu slabs */ flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; @@ -4212,59 +4165,56 @@ static int list_locations(struct kmem_cache *s, char *buf, #endif #ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) +static void __init resiliency_test(void) { u8 *p; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. Corruption after allocation\n"); p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); validate_slab_cache(kmalloc_caches[4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[6]); - printk(KERN_ERR "\nB. Corruption after free\n"); + pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[9]); } #else @@ -4339,11 +4289,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - lock_memory_hotplug(); + get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = atomic_long_read(&n->total_objects); @@ -4359,9 +4310,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4374,12 +4325,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - unlock_memory_hotplug(); + put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } @@ -4388,16 +4339,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@ -4519,7 +4466,7 @@ SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount - 1); + return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); @@ -5058,15 +5005,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #ifdef CONFIG_MEMCG_KMEM int i; char *buffer = NULL; + struct kmem_cache *root_cache; - if (!is_root_cache(s)) + if (is_root_cache(s)) return; + root_cache = s->memcg_params->root_cache; + /* * This mean this cache had no attribute written. Therefore, no point * in copying default values around */ - if (!s->max_attr_size) + if (!root_cache->max_attr_size) return; for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { @@ -5088,7 +5038,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5097,7 +5047,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(s->memcg_params->root_cache, buf); + attr->show(root_cache, buf); attr->store(s, buf, strlen(buf)); } @@ -5106,6 +5056,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #endif } +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5113,6 +5068,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -5130,6 +5086,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params->root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -5163,12 +5128,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = '-'; p += sprintf(p, "%07d", s->size); -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - p += sprintf(p, "-%08d", - memcg_cache_id(s->memcg_params->memcg)); -#endif - BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5195,29 +5154,42 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = slab_kset; + s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) { - kobject_put(&s->kobj); - return err; - } + if (err) + goto out_put_kobj; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; + if (err) + goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + err = -ENOMEM; + goto out_del_kobj; + } } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* @@ -5226,6 +5198,9 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -5276,7 +5251,7 @@ static int __init slab_sysfs_init(void) slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); - printk(KERN_ERR "Cannot register slab subsystem.\n"); + pr_err("Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -5285,8 +5260,8 @@ static int __init slab_sysfs_init(void) list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -5295,8 +5270,8 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", al->name); + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); kfree(al); } @@ -5318,13 +5293,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); diff --git a/mm/sparse.c b/mm/sparse.c index 63c3ea5c119..d1b48b691ac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,10 +5,12 @@ #include <linux/slab.h> #include <linux/mmzone.h> #include <linux/bootmem.h> +#include <linux/compiler.h> #include <linux/highmem.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> + #include "internal.h" #include <asm/dma.h> #include <asm/pgalloc.h> @@ -268,7 +270,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while - * other sections referencing the usemap retmain active. Similarly, + * other sections referencing the usemap remain active. Similarly, * a pgdat can prevent a section being removed. If section A * contains a pgdat and section B contains the usemap, both * sections become inter-dependent. This allocates usemaps @@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) } #endif -void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +void __weak __meminit vmemmap_populate_print_last(void) { } diff --git a/mm/swap.c b/mm/swap.c index 0092097b3f4..6b2dc3897cd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -62,12 +62,13 @@ static void __page_cache_release(struct page *page) del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } + mem_cgroup_uncharge(page); } static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } static void __put_compound_page(struct page *page) @@ -79,95 +80,88 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -static void put_compound_page(struct page *page) +/** + * Two special cases here: we could avoid taking compound_lock_irqsave + * and could skip the tail refcounting(in _mapcount). + * + * 1. Hugetlbfs page: + * + * PageHeadHuge will remain true until the compound page + * is released and enters the buddy allocator, and it could + * not be split by __split_huge_page_refcount(). + * + * So if we see PageHeadHuge set, and we have the tail page pin, + * then we could safely put head page. + * + * 2. Slab THP page: + * + * PG_slab is cleared before the slab frees the head page, and + * tail pin cannot be the last reference left on the head page, + * because the slab code is free to reuse the compound page + * after a kfree/kmem_cache_free without having to check if + * there's any tail pin left. In turn all tail pinsmust be always + * released while the head is still pinned by the slab code + * and so we know PG_slab will be still set too. + * + * So if we see PageSlab set, and we have the tail page pin, + * then we could safely put head page. + */ +static __always_inline +void put_unrefcounted_compound_page(struct page *page_head, struct page *page) { - struct page *page_head; - - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* __split_huge_page_refcount can run under us */ - page_head = compound_head(page); - /* - * THP can not break up slab pages so avoid taking - * compound_lock() and skip the tail page refcounting (in - * _mapcount) too. Slab performs non-atomic bit ops on - * page->flags for better performance. In particular - * slab_unlock() in slub used to be a hot path. It is still - * hot on arches that do not support - * this_cpu_cmpxchg_double(). - * - * If "page" is part of a slab or hugetlbfs page it cannot be - * splitted and the head page cannot change from under us. And - * if "page" is part of a THP page under splitting, if the - * head page pointed by the THP tail isn't a THP head anymore, - * we'll find PageTail clear after smp_rmb() and we'll treat - * it as a single page. + * If @page is a THP tail, we must read the tail page + * flags after the head page flags. The + * __split_huge_page_refcount side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. */ - if (!__compound_tail_refcounted(page_head)) { + smp_rmb(); + if (likely(PageTail(page))) { /* - * If "page" is a THP tail, we must read the tail page - * flags after the head page flags. The - * split_huge_page side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. + * __split_huge_page_refcount cannot race + * here, see the comment above this function. */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab - * compound page, the tail pin must - * not be the last reference held on - * the page, because the PG_slab - * cannot be cleared before all tail - * pins (which skips the _mapcount - * tail refcounting) have been - * released. For hugetlbfs the tail - * pin may be the last reference on - * the page instead, because - * PageHeadHuge will not go away until - * the compound page enters the buddy - * allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - return; - } else + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); + if (put_page_testzero(page_head)) { /* - * __split_huge_page_refcount run before us, - * "page" was a THP tail. The split page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). + * If this is the tail of a slab THP page, + * the tail pin must not be the last reference + * held on the page, because the PG_slab cannot + * be cleared before all tail pins (which skips + * the _mapcount tail refcounting) have been + * released. + * + * If this is the tail of a hugetlbfs page, + * the tail pin may be the last reference on + * the page instead, because PageHeadHuge will + * not go away until the compound page enters + * the buddy allocator. */ - goto out_put_single; - } + VM_BUG_ON_PAGE(PageSlab(page_head), page_head); + __put_compound_page(page_head); + } + } else + /* + * __split_huge_page_refcount run before us, + * @page was a THP tail. The split @page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). + */ + if (put_page_testzero(page)) + __put_single_page(page); +} +static __always_inline +void put_refcounted_compound_page(struct page *page_head, struct page *page) +{ if (likely(page != page_head && get_page_unless_zero(page_head))) { unsigned long flags; /* - * page_head wasn't a dangling pointer but it may not + * @page_head wasn't a dangling pointer but it may not * be a head page anymore by the time we obtain the * lock. That is ok as long as it can't be freed from * under us. @@ -178,7 +172,7 @@ static void put_compound_page(struct page *page) compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { /* - * The head page may have been freed + * The @page_head may have been freed * and reallocated as a compound page * of smaller order and then freed * again. All we know is that it @@ -222,12 +216,51 @@ out_put_single: __put_single_page(page_head); } } else { - /* page_head is a dangling pointer */ + /* @page_head is a dangling pointer */ VM_BUG_ON_PAGE(PageTail(page), page); goto out_put_single; } } +static void put_compound_page(struct page *page) +{ + struct page *page_head; + + /* + * We see the PageCompound set and PageTail not set, so @page maybe: + * 1. hugetlbfs head page, or + * 2. THP head page. + */ + if (likely(!PageTail(page))) { + if (put_page_testzero(page)) { + /* + * By the time all refcounts have been released + * split_huge_page cannot run anymore from under us. + */ + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); + } + return; + } + + /* + * We see the PageCompound set and PageTail set, so @page maybe: + * 1. a tail hugetlbfs page, or + * 2. a tail THP page, or + * 3. a split THP page. + * + * Case 3 is possible, as we may race with + * __split_huge_page_refcount tearing down a THP page. + */ + page_head = compound_head_by_tail(page); + if (!__compound_tail_refcounted(page_head)) + put_unrefcounted_compound_page(page_head, page); + else + put_refcounted_compound_page(page_head, page); +} + void put_page(struct page *page) { if (unlikely(PageCompound(page))) @@ -441,7 +474,7 @@ void rotate_reclaimable_page(struct page *page) page_cache_get(page); local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); + pvec = this_cpu_ptr(&lru_rotate_pvecs); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); local_irq_restore(flags); @@ -469,7 +502,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_activate(page, page_to_pfn(page)); + trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -557,6 +590,9 @@ static void __lru_cache_activate_page(struct page *page) * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void mark_page_accessed(struct page *page) { @@ -574,19 +610,15 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); + if (page_is_file_cache(page)) + workingset_activation(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); } } EXPORT_SYMBOL(mark_page_accessed); -/* - * Queue the page for addition to the LRU via pagevec. The decision on whether - * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of __lru_cache_add() - * have the page added to the active list using mark_page_accessed(). - */ -void __lru_cache_add(struct page *page) +static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); @@ -596,11 +628,34 @@ void __lru_cache_add(struct page *page) pagevec_add(pvec, page); put_cpu_var(lru_add_pvec); } -EXPORT_SYMBOL(__lru_cache_add); + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +void lru_cache_add_anon(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} + +void lru_cache_add_file(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} +EXPORT_SYMBOL(lru_cache_add_file); /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. + * + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ void lru_cache_add(struct page *page) { @@ -633,6 +688,40 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * Place @page on the active or unevictable LRU list, depending on its + * evictability. Note that if the page is not evictable, it goes + * directly back onto it's zone's unevictable list, it does NOT use a + * per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { + SetPageActive(page); + lru_cache_add(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); +} + /* * If the page can not be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the @@ -811,7 +900,7 @@ void lru_add_drain_all(void) * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() * will free it. */ -void release_pages(struct page **pages, int nr, int cold) +void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); @@ -852,13 +941,14 @@ void release_pages(struct page **pages, int nr, int cold) } /* Clear Active bit in case of parallel mark_page_accessed */ - ClearPageActive(page); + __ClearPageActive(page); list_add(&page->lru, &pages_to_free); } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); + mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); @@ -934,7 +1024,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); - trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); + trace_mm_lru_insertion(page, lru); } /* @@ -948,6 +1038,57 @@ void __pagevec_lru_add(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_lru_add); /** + * pagevec_lookup_entries - gang pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_entries: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_entries pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. + */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - pagevec exceptionals pruning + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + +/** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace30d43..3e0ec83d000 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,6 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, } @@ -176,7 +177,7 @@ int add_to_swap(struct page *page, struct list_head *list) if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry, NULL); + swapcache_free(entry); return 0; } @@ -202,7 +203,7 @@ int add_to_swap(struct page *page, struct list_head *list) * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); return 0; } } @@ -225,7 +226,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry, page); + swapcache_free(entry); page_cache_release(page); } @@ -270,7 +271,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) for (i = 0; i < todo; i++) free_swap_cache(pagep[i]); - release_pages(pagep, todo, 0); + release_pages(pagep, todo, false); pagep += todo; nr -= todo; } @@ -386,7 +387,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6992b..8798b2e0ac5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; -static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. + */ +PLIST_HEAD(swap_active_head); + +/* + * all available (active, not full) swap_info_structs + * protected with swap_avail_lock, ordered by priority. + * This is used by get_swap_page() instead of swap_active_head + * because swap_active_head includes all swap_info_structs, + * but get_swap_page() doesn't need to look at full ones. + * This uses its own lock instead of swap_lock because when a + * swap_info_struct changes between not-full/full, it needs to + * add/remove itself to/from this list, but the swap_info_struct->lock + * is held and the locking order requires swap_lock to be taken + * before any swap_info_struct->lock. + */ +static PLIST_HEAD(swap_avail_head); +static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * But if seek is cheap, search from our current position, so - * that swap is allocated from all over the partition: if the - * Flash Translation Layer only remaps within limited zones, - * we don't want to wear out the first zone too quickly. + * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info + * case, just handled by scan_swap_map_try_ssd_cluster() above. */ - if (!(si->flags & SWP_SOLIDSTATE)) - scan_base = offset = si->lowest_bit; + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } } - offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster < scan_base; offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -591,6 +586,9 @@ checks: if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); @@ -640,71 +638,65 @@ no_page: swp_entry_t get_swap_page(void) { - struct swap_info_struct *si; + struct swap_info_struct *si, *next; pgoff_t offset; - int type, next; - int wrapped = 0; - int hp_index; - spin_lock(&swap_lock); if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - hp_index = atomic_xchg(&highest_priority_index, -1); - /* - * highest_priority_index records current highest priority swap - * type which just frees swap entries. If its priority is - * higher than that of swap_list.next swap type, we use it. It - * isn't protected by swap_lock, so it can be an invalid value - * if the corresponding swap type is swapoff. We double check - * the flags here. It's even possible the swap type is swapoff - * and swapon again and its priority is changed. In such rare - * case, low prority swap type might be used, but eventually - * high priority swap will be used after several rounds of - * swap. - */ - if (hp_index != -1 && hp_index != type && - swap_info[type]->prio < swap_info[hp_index]->prio && - (swap_info[hp_index]->flags & SWP_WRITEOK)) { - type = hp_index; - swap_list.next = type; - } - - si = swap_info[type]; - next = si->next; - if (next < 0 || - (!wrapped && si->prio != swap_info[next]->prio)) { - next = swap_list.head; - wrapped++; - } + spin_lock(&swap_avail_lock); +start_over: + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + /* requeue si to after same-priority siblings */ + plist_requeue(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&si->lock); - if (!si->highest_bit) { - spin_unlock(&si->lock); - continue; - } - if (!(si->flags & SWP_WRITEOK)) { + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + if (plist_node_empty(&si->avail_list)) { + spin_unlock(&si->lock); + goto nextsi; + } + WARN(!si->highest_bit, + "swap_info %d in list but !highest_bit\n", + si->type); + WARN(!(si->flags & SWP_WRITEOK), + "swap_info %d in list but !SWP_WRITEOK\n", + si->type); + plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&si->lock); - continue; + goto nextsi; } - swap_list.next = next; - - spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); spin_unlock(&si->lock); if (offset) - return swp_entry(type, offset); - spin_lock(&swap_lock); - next = swap_list.next; + return swp_entry(si->type, offset); + pr_debug("scan_swap_map of si %d failed to find offset\n", + si->type); + spin_lock(&swap_avail_lock); +nextsi: + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si + * and it filled up before we could get one; or, the si filled + * up between us dropping swap_avail_lock and taking si->lock. + * Since we dropped the swap_avail_lock, the swap_avail_head + * list may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over. + */ + if (plist_node_empty(&next->avail_list)) + goto start_over; } + spin_unlock(&swap_avail_lock); + atomic_long_inc(&nr_swap_pages); noswap: - spin_unlock(&swap_lock); return (swp_entry_t) {0}; } @@ -766,27 +758,6 @@ out: return NULL; } -/* - * This swap type frees swap entry, check if it is the highest priority swap - * type which just frees swap entry. get_swap_page() uses - * highest_priority_index to search highest priority swap type. The - * swap_info_struct.lock can't protect us if there are multiple swap types - * active, so we use atomic_cmpxchg. - */ -static void set_highest_priority_index(int type) -{ - int old_hp_index, new_hp_index; - - do { - old_hp_index = atomic_read(&highest_priority_index); - if (old_hp_index != -1 && - swap_info[old_hp_index]->prio >= swap_info[type]->prio) - break; - new_hp_index = type; - } while (atomic_cmpxchg(&highest_priority_index, - old_hp_index, new_hp_index) != old_hp_index); -} - static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -828,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; - if (offset > p->highest_bit) + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; p->highest_bit = offset; - set_highest_priority_index(p->type); + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); @@ -863,16 +843,13 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry, struct page *page) +void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; - unsigned char count; p = swap_info_get(entry); if (p) { - count = swap_entry_free(p, entry, SWAP_HAS_CACHE); - if (page) - mem_cgroup_uncharge_swapcache(page, entry, count != 0); + swap_entry_free(p, entry, SWAP_HAS_CACHE); spin_unlock(&p->lock); } } @@ -1126,15 +1103,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, - GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge_swapin(memcg); + mem_cgroup_cancel_charge(page, memcg); ret = 0; goto out; } @@ -1144,11 +1120,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); - if (page == swapcache) + if (page == swapcache) { page_add_anon_rmap(page, vma, addr); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, memcg); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); /* * Move the page to the active list so it is not @@ -1765,30 +1744,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - int i, prev; - if (prio >= 0) p->prio = prio; else p->prio = --least_priority; + /* + * the plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + p->list.prio = -p->prio; + p->avail_list.prio = -p->prio; p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p->type; - else - swap_info[prev]->next = p->type; + assert_spin_locked(&swap_lock); + /* + * both lists are plists, and thus priority ordered. + * swap_active_head needs to be priority ordered for swapoff(), + * which on removal of any swap_info_struct with an auto-assigned + * (i.e. negative) priority increments the auto-assigned priority + * of any lower-priority swap_info_structs. + * swap_avail_head needs to be priority ordered for get_swap_page(), + * which allocates swap pages from the highest available priority + * swap_info_struct. + */ + plist_add(&p->list, &swap_active_head); + spin_lock(&swap_avail_lock); + plist_add(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1823,8 +1809,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int i, type, prev; - int err; + int err, found = 0; unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) @@ -1842,17 +1827,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out; mapping = victim->f_mapping; - prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { - p = swap_info[type]; + plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { - if (p->swap_file->f_mapping == mapping) + if (p->swap_file->f_mapping == mapping) { + found = 1; break; + } } - prev = type; } - if (type < 0) { + if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; @@ -1864,20 +1848,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) - swap_list.head = p->next; - else - swap_info[prev]->next = p->next; - if (type == swap_list.next) { - /* just pick something that's safe... */ - swap_list.next = swap_list.head; - } + spin_lock(&swap_avail_lock); + plist_del(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&p->lock); if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i]->next) - swap_info[i]->prio = p->prio--; + struct swap_info_struct *si = p; + + plist_for_each_entry_continue(si, &swap_active_head, list) { + si->prio++; + si->list.prio--; + si->avail_list.prio--; + } least_priority++; } + plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -1885,7 +1870,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); set_current_oom_origin(); - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); if (err) { @@ -1926,7 +1911,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); - frontswap_invalidate_area(type); + frontswap_invalidate_area(p->type); frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); @@ -1935,7 +1920,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account information */ - swap_cgroup_swapoff(type); + swap_cgroup_swapoff(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2142,8 +2127,9 @@ static struct swap_info_struct *alloc_swap_info(void) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); + plist_node_init(&p->list, 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; - p->next = -1; spin_unlock(&swap_lock); spin_lock_init(&p->lock); diff --git a/mm/truncate.c b/mm/truncate.c index 353b683afd6..96d167372d8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,6 +22,45 @@ #include <linux/cleancache.h> #include "internal.h" +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrshadows--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); +unlock: + spin_unlock_irq(&mapping->tree_lock); +} /** * do_invalidatepage - invalidate part or all of a page @@ -208,11 +247,12 @@ void truncate_inode_pages_range(struct address_space *mapping, unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; /* Offsets within partial pages */ @@ -238,17 +278,22 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec, 0); index = start; - while (index < end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { - mem_cgroup_uncharge_start(); + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -259,8 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -307,25 +352,36 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* If all gone from start onwards, we're done */ if (index == start) break; + /* Otherwise restart to make sure all gone */ index = start; continue; } - if (index == start && pvec.pages[0]->index >= end) { + if (index == start && indices[0] >= end) { + /* All gone out of hole to be punched, we're done */ + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; - if (index >= end) + index = indices[i]; + if (index >= end) { + /* Restart punch to make sure all gone */ + index = start - 1; break; + } + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } lock_page(page); WARN_ON(page->index != index); @@ -333,8 +389,8 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); index++; } cleancache_invalidate_inode(mapping); @@ -360,6 +416,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) EXPORT_SYMBOL(truncate_inode_pages); /** + * truncate_inode_pages_final - truncate *all* pages before inode dies + * @mapping: mapping to truncate + * + * Called under (and serialized by) inode->i_mutex. + * + * Filesystems have to use this in the .evict_inode path to inform the + * VM that this is the final truncate and the inode is going away. + */ +void truncate_inode_pages_final(struct address_space *mapping) +{ + unsigned long nrshadows; + unsigned long nrpages; + + /* + * Page reclaim can not participate in regular inode lifetime + * management (can't call iput()) and thus can race with the + * inode teardown. Tell it when the address space is exiting, + * so that it does not install eviction information after the + * final truncate has begun. + */ + mapping_set_exiting(mapping); + + /* + * When reclaim installs eviction entries, it increases + * nrshadows first, then decreases nrpages. Make sure we see + * this in the right order or we might miss an entry. + */ + nrpages = mapping->nrpages; + smp_rmb(); + nrshadows = mapping->nrshadows; + + if (nrpages || nrshadows) { + /* + * As truncation uses a lockless tree lookup, cycle + * the tree lock to make sure any ongoing tree + * modification that does not see AS_EXITING is + * completed before starting the final truncate. + */ + spin_lock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + + truncate_inode_pages(mapping, 0); + } +} +EXPORT_SYMBOL(truncate_inode_pages_final); + +/** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate * @start: the offset 'from' which to invalidate @@ -375,32 +478,30 @@ EXPORT_SYMBOL(truncate_inode_pages); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; - /* - * Note: this function may get called on a shmem/tmpfs mapping: - * pagevec_lookup() might then return 0 prematurely (because it - * got a gangful of swap entries); but it's hardly worth worrying - * about - it can rarely have anything to free from such a mapping - * (most pages are dirty), and already skips over any difficulties. - */ - pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { - mem_cgroup_uncharge_start(); + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -414,8 +515,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, deactivate_page(page); count += ret; } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -444,9 +545,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -481,6 +581,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; @@ -491,17 +592,22 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { - mem_cgroup_uncharge_start(); + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { @@ -539,8 +645,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret = ret2; unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } diff --git a/mm/util.c b/mm/util.c index a24aa22f247..093c973f169 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> @@ -9,14 +10,12 @@ #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> #include "internal.h" -#define CREATE_TRACE_POINTS -#include <trace/events/kmem.h> - /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -110,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) -{ - void *ret; - size_t ks = 0; - - if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; -} - -/** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. - */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) -{ - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; - - return __do_krealloc(p, new_size, flags); - -} -EXPORT_SYMBOL(__krealloc); - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** - * kzfree - like kfree but zero memory - * @p: object to free memory of - * - * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. - * - * Note: this function zeroes the whole allocated buffer which can be a good - * deal bigger than the requested buffer size passed to kmalloc(). So be - * careful when using this function in performance sensitive code. - */ -void kzfree(const void *p) -{ - size_t ks; - void *mem = (void *)p; - - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; - ks = ksize(mem); - memset(mem, 0, ks); - kfree(mem); -} -EXPORT_SYMBOL(kzfree); - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -275,17 +183,14 @@ pid_t vm_is_stack(struct task_struct *task, if (in_group) { struct task_struct *t; - rcu_read_lock(); - if (!pid_alive(task)) - goto done; - t = task; - do { + rcu_read_lock(); + for_each_thread(task, t) { if (vm_is_stack_for_task(t, vma)) { ret = t->pid; goto done; } - } while_each_thread(task, t); + } done: rcu_read_unlock(); } @@ -307,7 +212,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * If the architecture not support this function, simply return with no * page pinned */ -int __attribute__((weak)) __get_user_pages_fast(unsigned long start, +int __weak __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { return 0; @@ -338,7 +243,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. */ -int __attribute__((weak)) get_user_pages_fast(unsigned long start, +int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; @@ -386,6 +291,15 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + struct address_space *page_mapping(struct page *page) { struct address_space *mapping = page->mapping; @@ -445,11 +359,51 @@ unsigned long vm_commit_limit(void) return allowed; } +/** + * get_cmdline() - copy the cmdline value to a buffer. + * @task: the task whose cmdline value to copy. + * @buffer: the buffer to copy to. + * @buflen: the length of the buffer. Larger cmdline values are truncated + * to this length. + * Returns the size of the cmdline field copied. Note that the copy does + * not guarantee an ending NULL byte. + */ +int get_cmdline(struct task_struct *task, char *buffer, int buflen) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + + len = mm->arg_end - mm->arg_start; -/* Tracepoints definitions. */ -EXPORT_TRACEPOINT_SYMBOL(kmalloc); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); -EXPORT_TRACEPOINT_SYMBOL(kfree); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); + if (len > buflen) + len = buflen; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + /* + * If the nul at the end of args has been overwritten, then + * assume application is using setproctitle(3). + */ + if (res > 0 && buffer[res-1] != '\0' && len < buflen) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > buflen - res) + len = buflen - res; + res += access_process_vm(task, mm->env_start, + buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 00000000000..9f25af825de --- /dev/null +++ b/mm/vmacache.c @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmacache.h> + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + /* + * Single threaded tasks need not iterate the entire + * list of process. We can avoid the flushing as well + * since the mm's seqnum was increased and don't have + * to worry about other threads' seqnum. Current's + * flush will occur upon the next lookup. + */ + if (atomic_read(&mm->mm_users) == 1) + return; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. + */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. + */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (!vma) + continue; + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } + } + + return NULL; +} +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5..2b0aa548609 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -27,7 +27,9 @@ #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> +#include <linux/compiler.h> #include <linux/llist.h> + #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -1083,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram); * @node: prefer to allocate data structures on this node * @prot: memory protection to use. PAGE_KERNEL for regular RAM * + * If you use this function for less than VMAP_MAX_ALLOC pages, it could be + * faster than vmap so it's good. But if you mix long-life and short-life + * objects with vm_map_ram(), it could consume lots of address space through + * fragmentation (especially on a 32bit machine). You could see failures in + * the end. Please use this function for short-lived objects. + * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) @@ -1260,20 +1268,17 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } +EXPORT_SYMBOL_GPL(unmap_kernel_range); -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; unsigned long end = addr + get_vm_area_size(area); int err; - err = vmap_page_range(addr, end, prot, *pages); - if (err > 0) { - *pages += err; - err = 0; - } + err = vmap_page_range(addr, end, prot, pages); - return err; + return err > 0 ? 0 : err; } EXPORT_SYMBOL_GPL(map_vm_area); @@ -1488,7 +1493,7 @@ void vfree(const void *addr) if (!addr) return; if (unlikely(in_interrupt())) { - struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } else @@ -1539,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, &pages)) { + if (map_vm_area(area, prot, pages)) { vunmap(area->addr); return NULL; } @@ -1557,7 +1562,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; - gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1580,12 +1586,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; - gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; if (node == NUMA_NO_NODE) - page = alloc_page(tmp_mask); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, tmp_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1593,9 +1598,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; + if (gfp_mask & __GFP_WAIT) + cond_resched(); } - if (map_vm_area(area, prot, &pages)) + if (map_vm_area(area, prot, pages)) goto fail; return area->addr; @@ -2181,7 +2188,7 @@ EXPORT_SYMBOL(remap_vmalloc_range); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. */ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } @@ -2611,19 +2618,19 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) - seq_printf(m, " ioremap"); + seq_puts(m, " ioremap"); if (v->flags & VM_ALLOC) - seq_printf(m, " vmalloc"); + seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) - seq_printf(m, " vmap"); + seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) - seq_printf(m, " user"); + seq_puts(m, " user"); if (v->flags & VM_VPAGES) - seq_printf(m, " vpages"); + seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); @@ -2681,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) prev_end = VMALLOC_START; - spin_lock(&vmap_area_lock); + rcu_read_lock(); if (list_empty(&vmap_area_list)) { vmi->largest_chunk = VMALLOC_TOTAL; goto out; } - list_for_each_entry(va, &vmap_area_list, list) { + list_for_each_entry_rcu(va, &vmap_area_list, list) { unsigned long addr = va->va_start; /* @@ -2715,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) vmi->largest_chunk = VMALLOC_END - prev_end; out: - spin_unlock(&vmap_area_lock); + rcu_read_unlock(); } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index a9c74b40968..2836b5373b2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/module.h> #include <linux/gfp.h> @@ -43,6 +45,7 @@ #include <linux/sysctl.h> #include <linux/oom.h> #include <linux/prefetch.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -56,32 +59,20 @@ #include <trace/events/vmscan.h> struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -89,11 +80,27 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -130,7 +137,11 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. + */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -163,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -224,15 +236,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, unsigned long freed = 0; unsigned long long delta; long total_scan; - long max_pass; + long freeable; long nr; long new_nr; int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - max_pass = shrinker->count_objects(shrinker, shrinkctl); - if (max_pass == 0) + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) return 0; /* @@ -244,14 +256,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; + delta *= freeable; do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); - total_scan = max_pass; + total_scan = freeable; } /* @@ -260,26 +272,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * shrinkers to return -1 all the time. This results in a large * nr being built up so when a shrink that can do some work * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in + * freeable. This is bad for sustaining a working set in * memory. * * Hence only allow the shrinker to scan the entire cache when * a large delta change is calculated directly. */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; + if (total_scan > freeable * 2) + total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); + freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one @@ -292,12 +304,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (max_pass), we must be + * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || - total_scan >= max_pass) { + total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); @@ -324,7 +336,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); return freed; } @@ -458,7 +470,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in __generic_file_aio_write() against + * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * @@ -477,7 +489,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -523,7 +535,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. */ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -564,17 +577,30 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - swapcache_free(swap, page); + swapcache_free(swap); } else { void (*freepage)(struct page *); + void *shadow = NULL; freepage = mapping->a_ops->freepage; - - __delete_from_page_cache(page); + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optizimation, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. + */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (freepage != NULL) freepage(page); @@ -595,7 +621,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, page, false)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -796,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); - mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1065,7 +1090,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!mapping || !__remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1107,11 +1132,12 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - free_hot_cold_page_list(&free_pages, 1); + mem_cgroup_uncharge_list(&free_pages); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; *ret_nr_congested += nr_congested; *ret_nr_unqueued_dirty += nr_unqueued_dirty; @@ -1144,7 +1170,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; } @@ -1411,6 +1437,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1425,6 +1452,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) } /* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1470,7 +1510,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1505,7 +1545,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + mem_cgroup_uncharge_list(&page_list); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1540,19 +1581,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing - * pages from reclaim context. It will forcibly stall in the - * next check. + * pages from reclaim context. */ if (nr_unqueued_dirty == nr_taken) zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); /* - * In addition, if kswapd scans pages marked marked for - * immediate reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU faster than + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it implies + * that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_unqueued_dirty == nr_taken || nr_immediate) + if (nr_immediate && current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } @@ -1561,7 +1601,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * is congested. Allow kswapd to continue until it starts encountering * unqueued dirty pages or cycling through the LRU too quickly. */ - if (!sc->hibernation_mode && !current_is_kswapd()) + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, @@ -1619,6 +1660,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1660,7 +1702,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; @@ -1717,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * get_scan_count. */ reclaim_stat->recent_rotated[file] += nr_rotated; @@ -1726,7 +1768,8 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + mem_cgroup_uncharge_list(&l_hold); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP @@ -1816,13 +1859,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct scan_control *sc) -{ - if (global_reclaim(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->target_mem_cgroup); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -1839,8 +1875,8 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1848,10 +1884,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct zone *zone = lruvec_zone(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file, free; + unsigned long anon, file; bool force_scan = false; unsigned long ap, fp; enum lru_list lru; + bool some_scanned; + int pass; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1881,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1891,25 +1929,29 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && vmscan_swappiness(sc)) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* - * If it's foreseeable that reclaiming the file cache won't be - * enough to get the zone back into a desirable shape, we have - * to swap. Better start now and leave the - probably heavily - * thrashing - remaining file pages alone. + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); - if (unlikely(file + free <= high_wmark_pages(zone))) { + unsigned long zonefile; + unsigned long zonefree; + + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1930,7 +1972,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -1944,6 +1986,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; @@ -1971,46 +2019,57 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. + * Skip the second pass and don't force_scan, + * if we found something to scan. */ - scan = div64_u64(scan * fraction[file], denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) - scan = 0; - break; - default: - /* Look ma, no brain */ - BUG(); + some_scanned |= !!scan; } - nr[lru] = scan; } } /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2019,13 +2078,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2046,17 +2119,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) continue; /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. - */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; - - /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -2064,6 +2128,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; @@ -2185,9 +2258,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2203,10 +2277,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct lruvec *lruvec; + int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, sc); + shrink_lruvec(lruvec, swappiness, sc); /* * Direct reclaim and kswapd have to scan all memory @@ -2230,41 +2306,41 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } /* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. Continue reclaiming until * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + if (!compaction_suitable(zone, order)) return false; return watermark_ok; @@ -2286,10 +2362,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. + * Returns true if a zone was reclaimable. */ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { @@ -2297,16 +2370,26 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool aborted_reclaim = false; + unsigned long lru_pages = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + gfp_t orig_mask; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ + orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; + nodes_clear(shrink.nodes_to_scan); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) @@ -2318,24 +2401,31 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; + + lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if (compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2348,33 +2438,40 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); - } + if (shrink_zone(zone, sc)) + reclaimable = true; - return aborted_reclaim; -} - -/* All zones in zonelist are unreclaimable? */ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; + } - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (zone_reclaimable(zone)) - return false; + /* + * Don't shrink slabs when reclaiming memory from over limit cgroups + * but do shrink slab at least once when aborting reclaim for + * compaction to avoid unevenly scanning file/anon LRU pages over slab + * pages. + */ + if (global_reclaim(sc)) { + shrink_slab(&shrink, sc->nr_scanned, lru_pages); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } } - return true; + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + + return reclaimable; } /* @@ -2394,15 +2491,11 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc, - struct shrink_control *shrink) + struct scan_control *sc) { unsigned long total_scanned = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct zoneref *z; - struct zone *zone; unsigned long writeback_threshold; - bool aborted_reclaim; + bool zones_reclaimable; delayacct_freepages_start(); @@ -2413,37 +2506,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); - - /* - * Don't shrink slabs when reclaiming memory from over limit - * cgroups but do shrink slab at least once when aborting - * reclaim for compaction to avoid unevenly scanning file/anon - * LRU pages over slab pages. - */ - if (global_reclaim(sc)) { - unsigned long lru_pages = 0; - - nodes_clear(shrink->nodes_to_scan); - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), - shrink->nodes_to_scan); - } + zones_reclaimable = shrink_zones(zonelist, sc); - shrink_slab(shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2465,28 +2535,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0 && !aborted_reclaim); + } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. - */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->compaction_ready) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; @@ -2502,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state(zone, NR_FREE_PAGES); } + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ @@ -2530,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { + struct zoneref *z; struct zone *zone; - int high_zoneidx = gfp_zone(gfp_mask); - pg_data_t *pgdat; + pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly @@ -2551,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (fatal_signal_pending(current)) goto out; - /* Check if the pfmemalloc reserves are ok */ - first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); - pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) goto out; /* Account for the throttling */ @@ -2592,18 +2684,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, - .nodemask = nodemask, - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, }; /* @@ -2618,7 +2706,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, sc.may_writepage, gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2633,16 +2721,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .order = 0, - .priority = 0, - .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2658,7 +2744,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2674,19 +2760,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, }; /* @@ -2702,7 +2783,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.may_writepage, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2874,9 +2955,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * high wmark plus a "gap" where the gap is either the low * watermark or 1% of the zone, whichever is smaller. */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); /* * If there is no low memory pressure or the zone is balanced then no @@ -2945,12 +3025,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .order = order, .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, - .may_writepage = !laptop_mode, - .order = order, - .target_mem_cgroup = NULL, }; count_vm_event(PAGEOUTRUN); @@ -3285,7 +3364,10 @@ static int kswapd(void *p) } } + tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + return 0; } @@ -3328,17 +3410,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3349,7 +3427,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -3408,7 +3486,7 @@ int kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold lock_memory_hotplug(). + * hold mem_hotplug_begin/end(). */ void kswapd_stop(int nid) { @@ -3518,13 +3596,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, diff --git a/mm/vmstat.c b/mm/vmstat.c index def5dd2fbe6..e9ab104b956 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -200,14 +200,16 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } } /* - * For use when we know that interrupts are disabled. + * For use when we know that interrupts are disabled, + * or when we know that preemption is disabled and that + * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) @@ -489,7 +491,7 @@ static void refresh_cpu_vm_stats(void) continue; if (__this_cpu_read(p->pcp.count)) - drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); #endif } fold_diff(global_diff); @@ -761,6 +763,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -770,6 +773,9 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "workingset_refault", + "workingset_activate", + "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", "nr_dirty_threshold", @@ -810,6 +816,9 @@ const char * const vmstat_text[] = { "pgrotated", + "drop_pagecache", + "drop_slab", + #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", "numa_huge_pte_updates", @@ -860,6 +869,10 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ +#ifdef CONFIG_DEBUG_VM_VMACACHE + "vmacache_find_calls", + "vmacache_find_hits", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ @@ -1055,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1065,10 +1078,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); @@ -1220,7 +1233,7 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { refresh_cpu_vm_stats(); - schedule_delayed_work(&__get_cpu_var(vmstat_work), + schedule_delayed_work(this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } @@ -1292,14 +1305,14 @@ static int __init setup_vmstat(void) #ifdef CONFIG_SMP int cpu; - register_cpu_notifier(&vmstat_notifier); + cpu_notifier_register_begin(); + __register_cpu_notifier(&vmstat_notifier); - get_online_cpus(); for_each_online_cpu(cpu) { start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); } - put_online_cpus(); + cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); diff --git a/mm/workingset.c b/mm/workingset.c new file mode 100644 index 00000000000..f7216fa7da2 --- /dev/null +++ b/mm/workingset.c @@ -0,0 +1,414 @@ +/* + * Workingset detection + * + * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner + */ + +#include <linux/memcontrol.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/mm.h> + +/* + * Double CLOCK lists + * + * Per zone, two clock lists are maintained for file pages: the + * inactive and the active list. Freshly faulted pages start out at + * the head of the inactive list and page reclaim scans pages from the + * tail. Pages that are accessed multiple times on the inactive list + * are promoted to the active list, to protect them from reclaim, + * whereas active pages are demoted to the inactive list when the + * active list grows too big. + * + * fault ------------------------+ + * | + * +--------------+ | +-------------+ + * reclaim <- | inactive | <-+-- demotion | active | <--+ + * +--------------+ +-------------+ | + * | | + * +-------------- promotion ------------------+ + * + * + * Access frequency and refault distance + * + * A workload is thrashing when its pages are frequently used but they + * are evicted from the inactive list every time before another access + * would have promoted them to the active list. + * + * In cases where the average access distance between thrashing pages + * is bigger than the size of memory there is nothing that can be + * done - the thrashing set could never fit into memory under any + * circumstance. + * + * However, the average access distance could be bigger than the + * inactive list, yet smaller than the size of memory. In this case, + * the set could fit into memory if it weren't for the currently + * active pages - which may be used more, hopefully less frequently: + * + * +-memory available to cache-+ + * | | + * +-inactive------+-active----+ + * a b | c d e f g h i | J K L M N | + * +---------------+-----------+ + * + * It is prohibitively expensive to accurately track access frequency + * of pages. But a reasonable approximation can be made to measure + * thrashing on the inactive list, after which refaulting pages can be + * activated optimistically to compete with the existing active pages. + * + * Approximating inactive page access frequency - Observations: + * + * 1. When a page is accessed for the first time, it is added to the + * head of the inactive list, slides every existing inactive page + * towards the tail by one slot, and pushes the current tail page + * out of memory. + * + * 2. When a page is accessed for the second time, it is promoted to + * the active list, shrinking the inactive list by one slot. This + * also slides all inactive pages that were faulted into the cache + * more recently than the activated page towards the tail of the + * inactive list. + * + * Thus: + * + * 1. The sum of evictions and activations between any two points in + * time indicate the minimum number of inactive pages accessed in + * between. + * + * 2. Moving one inactive page N page slots towards the tail of the + * list requires at least N inactive page accesses. + * + * Combining these: + * + * 1. When a page is finally evicted from memory, the number of + * inactive pages accessed while the page was in cache is at least + * the number of page slots on the inactive list. + * + * 2. In addition, measuring the sum of evictions and activations (E) + * at the time of a page's eviction, and comparing it to another + * reading (R) at the time the page faults back into memory tells + * the minimum number of accesses while the page was not cached. + * This is called the refault distance. + * + * Because the first access of the page was the fault and the second + * access the refault, we combine the in-cache distance with the + * out-of-cache distance to get the complete minimum access distance + * of this page: + * + * NR_inactive + (R - E) + * + * And knowing the minimum access distance of a page, we can easily + * tell if the page would be able to stay in cache assuming all page + * slots in the cache were available: + * + * NR_inactive + (R - E) <= NR_inactive + NR_active + * + * which can be further simplified to + * + * (R - E) <= NR_active + * + * Put into words, the refault distance (out-of-cache) can be seen as + * a deficit in inactive list space (in-cache). If the inactive list + * had (R - E) more page slots, the page would not have been evicted + * in between accesses, but activated instead. And on a full system, + * the only thing eating into inactive list space is active pages. + * + * + * Activating refaulting pages + * + * All that is known about the active list is that the pages have been + * accessed more than once in the past. This means that at any given + * time there is actually a good chance that pages on the active list + * are no longer in active use. + * + * So when a refault distance of (R - E) is observed and there are at + * least (R - E) active pages, the refaulting page is activated + * optimistically in the hope that (R - E) active pages are actually + * used less frequently than the refaulting page - or even not used at + * all anymore. + * + * If this is wrong and demotion kicks in, the pages which are truly + * used more frequently will be reactivated while the less frequently + * used once will be evicted from memory. + * + * But if this is right, the stale pages will be pushed out of memory + * and the used pages get to stay in cache. + * + * + * Implementation + * + * For each zone's file LRU lists, a counter for inactive evictions + * and activations is maintained (zone->inactive_age). + * + * On eviction, a snapshot of this counter (along with some bits to + * identify the zone) is stored in the now empty page cache radix tree + * slot of the evicted page. This is called a shadow entry. + * + * On cache misses for which there are shadow entries, an eligible + * refault distance will immediately activate the refaulting page. + */ + +static void *pack_shadow(unsigned long eviction, struct zone *zone) +{ + eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); + eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + + return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); +} + +static void unpack_shadow(void *shadow, + struct zone **zone, + unsigned long *distance) +{ + unsigned long entry = (unsigned long)shadow; + unsigned long eviction; + unsigned long refault; + unsigned long mask; + int zid, nid; + + entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + zid = entry & ((1UL << ZONES_SHIFT) - 1); + entry >>= ZONES_SHIFT; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + eviction = entry; + + *zone = NODE_DATA(nid)->node_zones + zid; + + refault = atomic_long_read(&(*zone)->inactive_age); + mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + + RADIX_TREE_EXCEPTIONAL_SHIFT); + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + *distance = (refault - eviction) & mask; +} + +/** + * workingset_eviction - note the eviction of a page from memory + * @mapping: address space the page was backing + * @page: the page being evicted + * + * Returns a shadow entry to be stored in @mapping->page_tree in place + * of the evicted @page so that a later refault can be detected. + */ +void *workingset_eviction(struct address_space *mapping, struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long eviction; + + eviction = atomic_long_inc_return(&zone->inactive_age); + return pack_shadow(eviction, zone); +} + +/** + * workingset_refault - evaluate the refault of a previously evicted page + * @shadow: shadow entry of the evicted page + * + * Calculates and evaluates the refault distance of the previously + * evicted page in the context of the zone it was allocated in. + * + * Returns %true if the page should be activated, %false otherwise. + */ +bool workingset_refault(void *shadow) +{ + unsigned long refault_distance; + struct zone *zone; + + unpack_shadow(shadow, &zone, &refault_distance); + inc_zone_state(zone, WORKINGSET_REFAULT); + + if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + inc_zone_state(zone, WORKINGSET_ACTIVATE); + return true; + } + return false; +} + +/** + * workingset_activation - note a page activation + * @page: page that is being activated + */ +void workingset_activation(struct page *page) +{ + atomic_long_inc(&page_zone(page)->inactive_age); +} + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru workingset_shadow_nodes; + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long shadow_nodes; + unsigned long max_nodes; + unsigned long pages; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + local_irq_enable(); + + pages = node_present_pages(sc->nid); + /* + * Active cache pages are limited to 50% of memory, and shadow + * entries that represent a refault distance bigger than that + * do not have any effect. Limit the number of shadow nodes + * such that shadow entries do not exceed the number of active + * cache pages, assuming a worst-case node population density + * of 1/8th on average. + * + * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~2% of available memory: + * + * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + */ + max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + + if (shadow_nodes <= max_nodes) + return 0; + + return shadow_nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct address_space *mapping; + struct radix_tree_node *node; + unsigned int i; + int ret; + + /* + * Page cache insertions and deletions synchroneously maintain + * the shadow node LRU under the mapping->tree_lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has radix tree nodes on the LRU. + * + * We can then safely transition to the mapping->tree_lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. + */ + + node = container_of(item, struct radix_tree_node, private_list); + mapping = node->private_data; + + /* Coming from the list, invert the lock order */ + if (!spin_trylock(&mapping->tree_lock)) { + spin_unlock(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_del_init(item); + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. + */ + + BUG_ON(!node->count); + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + node->slots[i] = NULL; + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + BUG_ON(!mapping->nrshadows); + mapping->nrshadows--; + } + } + BUG_ON(node->count); + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + if (!__radix_tree_delete_node(&mapping->page_tree, node)) + BUG(); + + spin_unlock(&mapping->tree_lock); + ret = LRU_REMOVED_RETRY; +out: + local_irq_enable(); + cond_resched(); + local_irq_disable(); + spin_lock(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long ret; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, + shadow_lru_isolate, NULL, &sc->nr_to_scan); + local_irq_enable(); + return ret; +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * mapping->tree_lock. + */ +static struct lock_class_key shadow_nodes_key; + +static int __init workingset_init(void) +{ + int ret; + + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + if (ret) + goto err; + ret = register_shrinker(&workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + return 0; +err_list_lru: + list_lru_destroy(&workingset_shadow_nodes); +err: + return ret; +} +module_init(workingset_init); diff --git a/mm/zbud.c b/mm/zbud.c index 9451361e6aa..f26e7fcc7fa 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zbud.h> +#include <linux/zpool.h> /***************** * Structures @@ -113,6 +114,91 @@ struct zbud_header { }; /***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + return zpool_evict(pool, handle); +} + +static struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zbud_create_pool(gfp, &zbud_zpool_ops); +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zbud"); +#endif /* CONFIG_ZPOOL */ + +/***************** * Helpers *****************/ /* Just to make the code easier to read */ @@ -122,7 +208,7 @@ enum buddy { }; /* Converts an allocation size in bytes to size in zbud chunks */ -static int size_to_chunks(int size) +static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; } @@ -247,7 +333,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -255,7 +341,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, enum buddy bud; struct page *page; - if (size <= 0 || gfp & __GFP_HIGHMEM) + if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) return -ENOSPC; @@ -511,11 +597,20 @@ static int __init init_zbud(void) /* Make sure the zbud header will fit in one chunk */ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + return 0; } static void __exit exit_zbud(void) { +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + pr_info("unloaded\n"); } diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 00000000000..739cdf0d183 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,364 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/zpool.h> + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. + */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operatons on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c03ca5e9fe1..94f38fac5e8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -92,6 +92,7 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/zsmalloc.h> +#include <linux/zpool.h> /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -141,7 +142,7 @@ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* - * On systems with 4K page size, this gives 254 size classes! There is a + * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes @@ -240,6 +241,82 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_size_bytes(pool); +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zsmalloc"); +#endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -690,7 +767,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -814,21 +891,40 @@ static void zs_exit(void) { int cpu; +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); + __unregister_cpu_notifier(&zs_cpu_nb); + + cpu_notifier_register_done(); } static int zs_init(void) { int cpu, ret; - register_cpu_notifier(&zs_cpu_nb); + cpu_notifier_register_begin(); + + __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) + if (notifier_to_errno(ret)) { + cpu_notifier_register_done(); goto fail; + } } + + cpu_notifier_register_done(); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; fail: zs_exit(); @@ -1071,7 +1167,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) class = &pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = &__get_cpu_var(zs_map_area); + area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { diff --git a/mm/zswap.c b/mm/zswap.c index e55bab9dc41..ea064c1a09b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -34,7 +34,7 @@ #include <linux/swap.h> #include <linux/crypto.h> #include <linux/mempool.h> -#include <linux/zbud.h> +#include <linux/zpool.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -45,8 +45,8 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -static u64 zswap_pool_pages; +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); @@ -89,6 +89,14 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); + +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; + /********************************* * compression functions **********************************/ @@ -160,14 +168,14 @@ static void zswap_comp_exit(void) * rbnode - links the entry into red-black tree for the appropriate swap type * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code - * concurent calls to load, invalidate, and writeback. The lock + * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zsmalloc allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression */ struct zswap_entry { struct rb_node rbnode; @@ -189,7 +197,6 @@ struct zswap_header { struct zswap_tree { struct rb_root rbroot; spinlock_t lock; - struct zbud_pool *pool; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; @@ -202,10 +209,10 @@ static struct kmem_cache *zswap_entry_cache; static int zswap_entry_cache_create(void) { zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); - return (zswap_entry_cache == NULL); + return zswap_entry_cache == NULL; } -static void zswap_entry_cache_destory(void) +static void __init zswap_entry_cache_destroy(void) { kmem_cache_destroy(zswap_entry_cache); } @@ -282,16 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ -static void zswap_free_entry(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(tree->pool, entry->handle); + zpool_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); } /* caller must hold the tree lock */ @@ -311,7 +317,7 @@ static void zswap_entry_put(struct zswap_tree *tree, BUG_ON(refcount < 0); if (refcount == 0) { zswap_rb_erase(&tree->rbroot, entry); - zswap_free_entry(tree, entry); + zswap_free_entry(entry); } } @@ -346,7 +352,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) return NOTIFY_BAD; } *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; - dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) { pr_err("can't allocate compressor buffer\n"); crypto_free_comp(tfm); @@ -387,18 +393,18 @@ static int zswap_cpu_init(void) { unsigned long cpu; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) goto cleanup; - register_cpu_notifier(&zswap_cpu_notifier_block); - put_online_cpus(); + __register_cpu_notifier(&zswap_cpu_notifier_block); + cpu_notifier_register_done(); return 0; cleanup: for_each_online_cpu(cpu) __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); - put_online_cpus(); + cpu_notifier_register_done(); return -ENOMEM; } @@ -407,8 +413,8 @@ cleanup: **********************************/ static bool zswap_is_full(void) { - return (totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages); + return totalram_pages * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } /********************************* @@ -501,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); } while (err != -ENOMEM); if (new_page) @@ -524,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. */ -static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) { struct zswap_header *zhdr; swp_entry_t swpentry; @@ -540,12 +546,11 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) }; /* extract swpentry from data */ - zhdr = zbud_map(pool, handle); + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zbud_unmap(pool, handle); + zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); - BUG_ON(pool != tree->pool); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(tree->pool, 8)) { + if (zpool_shrink(zswap_pool, 1, NULL)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - zhdr = zbud_map(tree->pool, handle); + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(tree->pool, handle); + zpool_unmap_handle(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); return 0; @@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -804,16 +809,14 @@ static void zswap_frontswap_invalidate_area(unsigned type) /* walk the tree and free everything */ spin_lock(&tree->lock); rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(tree, entry); + zswap_free_entry(entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); - - zbud_destroy_pool(tree->pool); kfree(tree); zswap_trees[type] = NULL; } -static struct zbud_ops zswap_zbud_ops = { +static struct zpool_ops zswap_zpool_ops = { .evict = zswap_writeback_entry }; @@ -822,20 +825,14 @@ static void zswap_frontswap_init(unsigned type) struct zswap_tree *tree; tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); - if (!tree) - goto err; - tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); - if (!tree->pool) - goto freetree; + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; - return; - -freetree: - kfree(tree); -err: - pr_err("alloc failed, zswap disabled for swap type %d\n", type); } static struct frontswap_ops zswap_frontswap_ops = { @@ -877,8 +874,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", S_IRUGO, zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_u64("pool_pages", S_IRUGO, - zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); @@ -903,13 +900,30 @@ static void __exit zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + if (!zswap_enabled) return 0; pr_info("loading zswap\n"); + + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + &zswap_zpool_ops); + } + if (!zswap_pool) { + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); + goto error; + } + pr_info("using %s pool\n", zswap_zpool_type); + if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); - goto error; + goto cachefail; } if (zswap_comp_init()) { pr_err("compressor initialization failed\n"); @@ -919,6 +933,7 @@ static int __init init_zswap(void) pr_err("per-cpu initialization failed\n"); goto pcpufail; } + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); @@ -926,7 +941,9 @@ static int __init init_zswap(void) pcpufail: zswap_comp_exit(); compfail: - zswap_entry_cache_destory(); + zswap_entry_cache_destroy(); +cachefail: + zpool_destroy_pool(zswap_pool); error: return -ENOMEM; } |