summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig12
-rw-r--r--mm/Makefile12
-rw-r--r--mm/bootmem.c134
-rw-r--r--mm/cleancache.c6
-rw-r--r--mm/compaction.c547
-rw-r--r--mm/filemap.c108
-rw-r--r--mm/filemap_xip.c4
-rw-r--r--mm/huge_memory.c29
-rw-r--r--mm/hugetlb.c39
-rw-r--r--mm/internal.h49
-rw-r--r--mm/madvise.c15
-rw-r--r--mm/memblock.c49
-rw-r--r--mm/memcontrol.c787
-rw-r--r--mm/memory-failure.c10
-rw-r--r--mm/memory.c49
-rw-r--r--mm/memory_hotplug.c20
-rw-r--r--mm/mempolicy.c92
-rw-r--r--mm/migrate.c20
-rw-r--r--mm/mmap.c161
-rw-r--r--mm/mmzone.c14
-rw-r--r--mm/mremap.c26
-rw-r--r--mm/nobootmem.c113
-rw-r--r--mm/nommu.c20
-rw-r--r--mm/oom_kill.c48
-rw-r--r--mm/page-writeback.c3
-rw-r--r--mm/page_alloc.c498
-rw-r--r--mm/page_isolation.c15
-rw-r--r--mm/percpu.c22
-rw-r--r--mm/pgtable-generic.c4
-rw-r--r--mm/process_vm_access.c16
-rw-r--r--mm/readahead.c40
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/shmem.c543
-rw-r--r--mm/slub.c2
-rw-r--r--mm/sparse.c25
-rw-r--r--mm/swap.c129
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c33
-rw-r--r--mm/thrash.c155
-rw-r--r--mm/truncate.c25
-rw-r--r--mm/util.c30
-rw-r--r--mm/vmalloc.c7
-rw-r--r--mm/vmscan.c754
-rw-r--r--mm/vmstat.c17
44 files changed, 2576 insertions, 2114 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e338407f122..b2176374b98 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -198,7 +198,7 @@ config COMPACTION
config MIGRATION
bool "Page migration"
def_bool y
- depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION
+ depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful in
@@ -349,6 +349,16 @@ choice
benefit.
endchoice
+config CROSS_MEMORY_ATTACH
+ bool "Cross Memory Support"
+ depends on MMU
+ default y
+ help
+ Enabling this option adds the system calls process_vm_readv and
+ process_vm_writev which allow a process with the correct privileges
+ to directly read from or write to to another process's address space.
+ See the man page for more details.
+
#
# UP and nommu archs use km based percpu allocator
#
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00ef2a0..a156285ce88 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,15 +5,18 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o pgtable-generic.o \
- process_vm_access.o
+ vmalloc.o pagewalk.o pgtable-generic.o
+
+ifdef CONFIG_CROSS_MEMORY_ATTACH
+mmu-$(CONFIG_MMU) += process_vm_access.o
+endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o mmu_context.o percpu.o \
- $(mmu-y)
+ compaction.o $(mmu-y)
obj-y += init-mm.o
ifdef CONFIG_NO_BOOTMEM
@@ -25,14 +28,13 @@ endif
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
-obj-$(CONFIG_COMPACTION) += compaction.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d5..ec4fcb7a56c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
*/
static void __init link_bootmem(bootmem_data_t *bdata)
{
- struct list_head *iter;
+ bootmem_data_t *ent;
- list_for_each(iter, &bdata_list) {
- bootmem_data_t *ent;
-
- ent = list_entry(iter, bootmem_data_t, list);
- if (bdata->node_min_pfn < ent->node_min_pfn)
- break;
+ list_for_each_entry(ent, &bdata_list, list) {
+ if (bdata->node_min_pfn < ent->node_min_pfn) {
+ list_add_tail(&bdata->list, &ent->list);
+ return;
+ }
}
- list_add_tail(&bdata->list, iter);
+
+ list_add_tail(&bdata->list, &bdata_list);
}
/*
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
} else {
unsigned long off = 0;
- while (vec && off < BITS_PER_LONG) {
+ vec >>= start & (BITS_PER_LONG - 1);
+ while (vec) {
if (vec & 1) {
page = pfn_to_page(start + off);
__free_pages_bootmem(page, 0);
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
return ALIGN(base + off, align) - base;
}
-static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
p_bdata = bootmem_arch_preferred_node(bdata, size, align,
goal, limit);
if (p_bdata)
- return alloc_bootmem_core(p_bdata, size, align,
+ return alloc_bootmem_bdata(p_bdata, size, align,
goal, limit);
}
#endif
return NULL;
}
-static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+static void * __init alloc_bootmem_core(unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit)
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
bootmem_data_t *bdata;
void *region;
-restart:
region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
if (region)
return region;
@@ -614,11 +614,25 @@ restart:
if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
break;
- region = alloc_bootmem_core(bdata, size, align, goal, limit);
+ region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
if (region)
return region;
}
+ return NULL;
+}
+
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+restart:
+ ptr = alloc_bootmem_core(size, align, goal, limit);
+ if (ptr)
+ return ptr;
if (goal) {
goal = 0;
goto restart;
@@ -684,21 +698,56 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
-static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
+static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
void *ptr;
- ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
+again:
+ ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
+ align, goal, limit);
if (ptr)
return ptr;
- ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
+ ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
if (ptr)
return ptr;
- return ___alloc_bootmem(size, align, goal, limit);
+ ptr = alloc_bootmem_core(size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
+ if (goal) {
+ goal = 0;
+ goto again;
+ }
+
+ return NULL;
+}
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+}
+
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ if (ptr)
+ return ptr;
+
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ return NULL;
}
/**
@@ -722,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+ return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
}
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -743,7 +792,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
unsigned long new_goal;
new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
- ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+ ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
new_goal, 0);
if (ptr)
return ptr;
@@ -754,47 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
}
-#ifdef CONFIG_SPARSEMEM
-/**
- * alloc_bootmem_section - allocate boot memory from a specific section
- * @size: size of the request in bytes
- * @section_nr: sparse map section to allocate from
- *
- * Return NULL on failure.
- */
-void * __init alloc_bootmem_section(unsigned long size,
- unsigned long section_nr)
-{
- bootmem_data_t *bdata;
- unsigned long pfn, goal;
-
- pfn = section_nr_to_pfn(section_nr);
- goal = pfn << PAGE_SHIFT;
- bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
-
- return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
-}
-#endif
-
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- void *ptr;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
- if (ptr)
- return ptr;
-
- ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
- if (ptr)
- return ptr;
-
- return __alloc_bootmem_nopanic(size, align, goal);
-}
-
#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
#endif
@@ -839,6 +847,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node(pgdat->bdata, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
+ return ___alloc_bootmem_node(pgdat, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
}
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f61..32e6f4136fa 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
static int cleancache_get_key(struct inode *inode,
struct cleancache_filekey *key)
{
- int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+ int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
int len = 0, maxlen = CLEANCACHE_KEY_MAX;
struct super_block *sb = inode->i_sb;
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
if (sb->s_export_op != NULL) {
fhfn = sb->s_export_op->encode_fh;
if (fhfn) {
- struct dentry d;
- d.d_inode = inode;
- len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
+ len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
if (len <= 0 || len == 255)
return -1;
if (maxlen > CLEANCACHE_KEY_MAX)
diff --git a/mm/compaction.c b/mm/compaction.c
index 74a8c825ff2..4ac338af512 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,30 +16,11 @@
#include <linux/sysfs.h>
#include "internal.h"
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
-/*
- * compact_control is used to track pages being migrated and the free pages
- * they are being migrated to during memory compaction. The free_pfn starts
- * at the end of a zone and migrate_pfn begins at the start. Movable pages
- * are moved to the end of a zone during a compaction run and the run
- * completes when free_pfn <= migrate_pfn
- */
-struct compact_control {
- struct list_head freepages; /* List of free pages to migrate to */
- struct list_head migratepages; /* List of pages being migrated */
- unsigned long nr_freepages; /* Number of isolated free pages */
- unsigned long nr_migratepages; /* Number of pages to migrate */
- unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long migrate_pfn; /* isolate_migratepages search base */
- bool sync; /* Synchronous migration */
-
- int order; /* order a direct compactor needs */
- int migratetype; /* MOVABLE, RECLAIMABLE etc */
- struct zone *zone;
-};
-
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
@@ -54,24 +35,35 @@ static unsigned long release_freepages(struct list_head *freelist)
return count;
}
-/* Isolate free pages onto a private freelist. Must hold zone->lock */
-static unsigned long isolate_freepages_block(struct zone *zone,
- unsigned long blockpfn,
- struct list_head *freelist)
+static void map_pages(struct list_head *list)
+{
+ struct page *page;
+
+ list_for_each_entry(page, list, lru) {
+ arch_alloc_page(page, 0);
+ kernel_map_pages(page, 1, 1);
+ }
+}
+
+static inline bool migrate_async_suitable(int migratetype)
+{
+ return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
+}
+
+/*
+ * Isolate free pages onto a private freelist. Caller must hold zone->lock.
+ * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
+ * pages inside of the pageblock (even though it may still end up isolating
+ * some pages).
+ */
+static unsigned long isolate_freepages_block(unsigned long blockpfn,
+ unsigned long end_pfn,
+ struct list_head *freelist,
+ bool strict)
{
- unsigned long zone_end_pfn, end_pfn;
int nr_scanned = 0, total_isolated = 0;
struct page *cursor;
- /* Get the last PFN we should scan for free pages at */
- zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
- end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
-
- /* Find the first usable PFN in the block to initialse page cursor */
- for (; blockpfn < end_pfn; blockpfn++) {
- if (pfn_valid_within(blockpfn))
- break;
- }
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. This assumes the block is valid */
@@ -79,15 +71,23 @@ static unsigned long isolate_freepages_block(struct zone *zone,
int isolated, i;
struct page *page = cursor;
- if (!pfn_valid_within(blockpfn))
+ if (!pfn_valid_within(blockpfn)) {
+ if (strict)
+ return 0;
continue;
+ }
nr_scanned++;
- if (!PageBuddy(page))
+ if (!PageBuddy(page)) {
+ if (strict)
+ return 0;
continue;
+ }
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
+ if (!isolated && strict)
+ return 0;
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
@@ -105,114 +105,71 @@ static unsigned long isolate_freepages_block(struct zone *zone,
return total_isolated;
}
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
-
- int migratetype = get_pageblock_migratetype(page);
-
- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
- if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
- return false;
-
- /* If the page is a large free page, then allow migration */
- if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return true;
-
- /* If the block is MIGRATE_MOVABLE, allow migration */
- if (migratetype == MIGRATE_MOVABLE)
- return true;
-
- /* Otherwise skip the block */
- return false;
-}
-
-/*
- * Based on information in the current compact_control, find blocks
- * suitable for isolating free pages from and then isolate them.
+/**
+ * isolate_freepages_range() - isolate free pages.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn: The one-past-last PFN.
+ *
+ * Non-free pages, invalid PFNs, or zone boundaries within the
+ * [start_pfn, end_pfn) range are considered errors, cause function to
+ * undo its actions and return zero.
+ *
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater then end_pfn if end fell in a middle of
+ * a free page).
*/
-static void isolate_freepages(struct zone *zone,
- struct compact_control *cc)
+unsigned long
+isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
{
- struct page *page;
- unsigned long high_pfn, low_pfn, pfn;
- unsigned long flags;
- int nr_freepages = cc->nr_freepages;
- struct list_head *freelist = &cc->freepages;
-
- /*
- * Initialise the free scanner. The starting point is where we last
- * scanned from (or the end of the zone if starting). The low point
- * is the end of the pageblock the migration scanner is using.
- */
- pfn = cc->free_pfn;
- low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+ unsigned long isolated, pfn, block_end_pfn, flags;
+ struct zone *zone = NULL;
+ LIST_HEAD(freelist);
- /*
- * Take care that if the migration scanner is at the end of the zone
- * that the free scanner does not accidentally move to the next zone
- * in the next isolation cycle.
- */
- high_pfn = min(low_pfn, pfn);
+ if (pfn_valid(start_pfn))
+ zone = page_zone(pfn_to_page(start_pfn));
- /*
- * Isolate free pages until enough are available to migrate the
- * pages on cc->migratepages. We stop searching if the migrate
- * and free page scanners meet or enough free pages are isolated.
- */
- for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
- pfn -= pageblock_nr_pages) {
- unsigned long isolated;
-
- if (!pfn_valid(pfn))
- continue;
+ for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
+ if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
+ break;
/*
- * Check for overlapping nodes/zones. It's possible on some
- * configurations to have a setup like
- * node0 node1 node0
- * i.e. it's possible that all pages within a zones range of
- * pages do not belong to a single zone.
+ * On subsequent iterations ALIGN() is actually not needed,
+ * but we keep it that we not to complicate the code.
*/
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
- continue;
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
- /* Check the block is suitable for migration */
- if (!suitable_migration_target(page))
- continue;
+ spin_lock_irqsave(&zone->lock, flags);
+ isolated = isolate_freepages_block(pfn, block_end_pfn,
+ &freelist, true);
+ spin_unlock_irqrestore(&zone->lock, flags);
/*
- * Found a block suitable for isolating free pages from. Now
- * we disabled interrupts, double check things are ok and
- * isolate the pages. This is to minimise the time IRQs
- * are disabled
+ * In strict mode, isolate_freepages_block() returns 0 if
+ * there are any holes in the block (ie. invalid PFNs or
+ * non-free pages).
*/
- isolated = 0;
- spin_lock_irqsave(&zone->lock, flags);
- if (suitable_migration_target(page)) {
- isolated = isolate_freepages_block(zone, pfn, freelist);
- nr_freepages += isolated;
- }
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (!isolated)
+ break;
/*
- * Record the highest PFN we isolated pages from. When next
- * looking for free pages, the search will restart here as
- * page migration may have returned some pages to the allocator
+ * If we managed to isolate pages, it is always (1 << n) *
+ * pageblock_nr_pages for some non-negative n. (Max order
+ * page may span two pageblocks).
*/
- if (isolated)
- high_pfn = max(high_pfn, pfn);
}
/* split_free_page does not map the pages */
- list_for_each_entry(page, freelist, lru) {
- arch_alloc_page(page, 0);
- kernel_map_pages(page, 1, 1);
+ map_pages(&freelist);
+
+ if (pfn < end_pfn) {
+ /* Loop terminated early, cleanup. */
+ release_freepages(&freelist);
+ return 0;
}
- cc->free_pfn = high_pfn;
- cc->nr_freepages = nr_freepages;
+ /* We don't use freelists for anything. */
+ return pfn;
}
/* Update the number of anon and file isolated pages in the zone */
@@ -243,37 +200,34 @@ static bool too_many_isolated(struct zone *zone)
return isolated > (inactive + active) / 2;
}
-/* possible outcome of isolate_migratepages */
-typedef enum {
- ISOLATE_ABORT, /* Abort compaction now */
- ISOLATE_NONE, /* No pages isolated, continue scanning */
- ISOLATE_SUCCESS, /* Pages isolated, migrate */
-} isolate_migrate_t;
-
-/*
- * Isolate all pages that can be migrated from the block pointed to by
- * the migrate scanner within compact_control.
+/**
+ * isolate_migratepages_range() - isolate all migrate-able pages in range.
+ * @zone: Zone pages are in.
+ * @cc: Compaction control structure.
+ * @low_pfn: The first PFN of the range.
+ * @end_pfn: The one-past-the-last PFN of the range.
+ *
+ * Isolate all pages that can be migrated from the range specified by
+ * [low_pfn, end_pfn). Returns zero if there is a fatal signal
+ * pending), otherwise PFN of the first page that was not scanned
+ * (which may be both less, equal to or more then end_pfn).
+ *
+ * Assumes that cc->migratepages is empty and cc->nr_migratepages is
+ * zero.
+ *
+ * Apart from cc->migratepages and cc->nr_migratetypes this function
+ * does not modify any cc's fields, in particular it does not modify
+ * (or read for that matter) cc->migrate_pfn.
*/
-static isolate_migrate_t isolate_migratepages(struct zone *zone,
- struct compact_control *cc)
+unsigned long
+isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn)
{
- unsigned long low_pfn, end_pfn;
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
- isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
-
- /* Do not scan outside zone boundaries */
- low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
-
- /* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
-
- /* Do not cross the free scanner or scan within a memory hole */
- if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
- cc->migrate_pfn = end_pfn;
- return ISOLATE_NONE;
- }
+ isolate_mode_t mode = 0;
+ struct lruvec *lruvec;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -282,13 +236,13 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
- if (!cc->sync)
- return ISOLATE_ABORT;
+ if (cc->mode != COMPACT_SYNC)
+ return 0;
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
- return ISOLATE_ABORT;
+ return 0;
}
/* Time to isolate some pages for migration */
@@ -350,8 +304,9 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* satisfies the allocation
*/
pageblock_nr = low_pfn >> pageblock_order;
- if (!cc->sync && last_pageblock_nr != pageblock_nr &&
- get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+ if (cc->mode != COMPACT_SYNC &&
+ last_pageblock_nr != pageblock_nr &&
+ !migrate_async_suitable(get_pageblock_migratetype(page))) {
low_pfn += pageblock_nr_pages;
low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
last_pageblock_nr = pageblock_nr;
@@ -371,17 +326,19 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
}
- if (!cc->sync)
+ if (cc->mode != COMPACT_SYNC)
mode |= ISOLATE_ASYNC_MIGRATE;
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
/* Try isolate the page */
- if (__isolate_lru_page(page, mode, 0) != 0)
+ if (__isolate_lru_page(page, mode) != 0)
continue;
VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
- del_page_from_lru_list(zone, page, page_lru(page));
+ del_page_from_lru_list(page, lruvec, page_lru(page));
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
@@ -396,11 +353,200 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
acct_isolated(zone, cc);
spin_unlock_irq(&zone->lru_lock);
- cc->migrate_pfn = low_pfn;
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
- return ISOLATE_SUCCESS;
+ return low_pfn;
+}
+
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+#ifdef CONFIG_COMPACTION
+/*
+ * Returns true if MIGRATE_UNMOVABLE pageblock was successfully
+ * converted to MIGRATE_MOVABLE type, false otherwise.
+ */
+static bool rescue_unmovable_pageblock(struct page *page)
+{
+ unsigned long pfn, start_pfn, end_pfn;
+ struct page *start_page, *end_page;
+
+ pfn = page_to_pfn(page);
+ start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ end_pfn = start_pfn + pageblock_nr_pages;
+
+ start_page = pfn_to_page(start_pfn);
+ end_page = pfn_to_page(end_pfn);
+
+ /* Do not deal with pageblocks that overlap zones */
+ if (page_zone(start_page) != page_zone(end_page))
+ return false;
+
+ for (page = start_page, pfn = start_pfn; page < end_page; pfn++,
+ page++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ if (PageBuddy(page)) {
+ int order = page_order(page);
+
+ pfn += (1 << order) - 1;
+ page += (1 << order) - 1;
+
+ continue;
+ } else if (page_count(page) == 0 || PageLRU(page))
+ continue;
+
+ return false;
+ }
+
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE);
+ return true;
+}
+
+enum smt_result {
+ GOOD_AS_MIGRATION_TARGET,
+ FAIL_UNMOVABLE_TARGET,
+ FAIL_BAD_TARGET,
+};
+
+/*
+ * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block
+ * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page
+ * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise.
+ */
+static enum smt_result suitable_migration_target(struct page *page,
+ struct compact_control *cc)
+{
+
+ int migratetype = get_pageblock_migratetype(page);
+
+ /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+ return FAIL_BAD_TARGET;
+
+ /* If the page is a large free page, then allow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ return GOOD_AS_MIGRATION_TARGET;
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (cc->mode != COMPACT_ASYNC_UNMOVABLE &&
+ migrate_async_suitable(migratetype))
+ return GOOD_AS_MIGRATION_TARGET;
+
+ if (cc->mode == COMPACT_ASYNC_MOVABLE &&
+ migratetype == MIGRATE_UNMOVABLE)
+ return FAIL_UNMOVABLE_TARGET;
+
+ if (cc->mode != COMPACT_ASYNC_MOVABLE &&
+ migratetype == MIGRATE_UNMOVABLE &&
+ rescue_unmovable_pageblock(page))
+ return GOOD_AS_MIGRATION_TARGET;
+
+ /* Otherwise skip the block */
+ return FAIL_BAD_TARGET;
+}
+
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ */
+static void isolate_freepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ struct page *page;
+ unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
+ unsigned long flags;
+ int nr_freepages = cc->nr_freepages;
+ struct list_head *freelist = &cc->freepages;
+
+ /*
+ * Initialise the free scanner. The starting point is where we last
+ * scanned from (or the end of the zone if starting). The low point
+ * is the end of the pageblock the migration scanner is using.
+ */
+ pfn = cc->free_pfn;
+ low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+
+ /*
+ * Take care that if the migration scanner is at the end of the zone
+ * that the free scanner does not accidentally move to the next zone
+ * in the next isolation cycle.
+ */
+ high_pfn = min(low_pfn, pfn);
+
+ zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+
+ /*
+ * isolate_freepages() may be called more than once during
+ * compact_zone_order() run and we want only the most recent
+ * count.
+ */
+ cc->nr_pageblocks_skipped = 0;
+
+ /*
+ * Isolate free pages until enough are available to migrate the
+ * pages on cc->migratepages. We stop searching if the migrate
+ * and free page scanners meet or enough free pages are isolated.
+ */
+ for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ pfn -= pageblock_nr_pages) {
+ unsigned long isolated;
+ enum smt_result ret;
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ /*
+ * Check for overlapping nodes/zones. It's possible on some
+ * configurations to have a setup like
+ * node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of
+ * pages do not belong to a single zone.
+ */
+ page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+ /* Check the block is suitable for migration */
+ ret = suitable_migration_target(page, cc);
+ if (ret != GOOD_AS_MIGRATION_TARGET) {
+ if (ret == FAIL_UNMOVABLE_TARGET)
+ cc->nr_pageblocks_skipped++;
+ continue;
+ }
+ /*
+ * Found a block suitable for isolating free pages from. Now
+ * we disabled interrupts, double check things are ok and
+ * isolate the pages. This is to minimise the time IRQs
+ * are disabled
+ */
+ isolated = 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ ret = suitable_migration_target(page, cc);
+ if (ret == GOOD_AS_MIGRATION_TARGET) {
+ end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
+ isolated = isolate_freepages_block(pfn, end_pfn,
+ freelist, false);
+ nr_freepages += isolated;
+ } else if (ret == FAIL_UNMOVABLE_TARGET)
+ cc->nr_pageblocks_skipped++;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
+ * Record the highest PFN we isolated pages from. When next
+ * looking for free pages, the search will restart here as
+ * page migration may have returned some pages to the allocator
+ */
+ if (isolated)
+ high_pfn = max(high_pfn, pfn);
+ }
+
+ /* split_free_page does not map the pages */
+ map_pages(freelist);
+
+ cc->free_pfn = high_pfn;
+ cc->nr_freepages = nr_freepages;
}
/*
@@ -449,6 +595,44 @@ static void update_nr_listpages(struct compact_control *cc)
cc->nr_freepages = nr_freepages;
}
+/* possible outcome of isolate_migratepages */
+typedef enum {
+ ISOLATE_ABORT, /* Abort compaction now */
+ ISOLATE_NONE, /* No pages isolated, continue scanning */
+ ISOLATE_SUCCESS, /* Pages isolated, migrate */
+} isolate_migrate_t;
+
+/*
+ * Isolate all pages that can be migrated from the block pointed to by
+ * the migrate scanner within compact_control.
+ */
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ unsigned long low_pfn, end_pfn;
+
+ /* Do not scan outside zone boundaries */
+ low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+
+ /* Only scan within a pageblock boundary */
+ end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
+
+ /* Do not cross the free scanner or scan within a memory hole */
+ if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
+ cc->migrate_pfn = end_pfn;
+ return ISOLATE_NONE;
+ }
+
+ /* Perform the isolation */
+ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
+ if (!low_pfn)
+ return ISOLATE_ABORT;
+
+ cc->migrate_pfn = low_pfn;
+
+ return ISOLATE_SUCCESS;
+}
+
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
@@ -578,8 +762,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)cc, false,
- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
+ (unsigned long)&cc->freepages, false,
+ (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT
+ : MIGRATE_ASYNC);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -608,7 +793,8 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- bool sync)
+ enum compact_mode mode,
+ unsigned long *nr_pageblocks_skipped)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -616,12 +802,17 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
- .sync = sync,
+ .mode = mode,
};
+ unsigned long rc;
+
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- return compact_zone(zone, &cc);
+ rc = compact_zone(zone, &cc);
+ *nr_pageblocks_skipped = cc.nr_pageblocks_skipped;
+
+ return rc;
}
int sysctl_extfrag_threshold = 500;
@@ -646,6 +837,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
+ unsigned long nr_pageblocks_skipped;
+ enum compact_mode mode;
/*
* Check whether it is worth even starting compaction. The order check is
@@ -662,12 +855,22 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- status = compact_zone_order(zone, order, gfp_mask, sync);
+ mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE;
+retry:
+ status = compact_zone_order(zone, order, gfp_mask, mode,
+ &nr_pageblocks_skipped);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
+
+ if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) {
+ if (nr_pageblocks_skipped) {
+ mode = COMPACT_ASYNC_UNMOVABLE;
+ goto retry;
+ }
+ }
}
return rc;
@@ -701,7 +904,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
if (ok && cc->order > zone->compact_order_failed)
zone->compact_order_failed = cc->order + 1;
/* Currently async compaction is never deferred. */
- else if (!ok && cc->sync)
+ else if (!ok && cc->mode == COMPACT_SYNC)
defer_compaction(zone, cc->order);
}
@@ -716,7 +919,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
- .sync = false,
+ .mode = COMPACT_ASYNC_MOVABLE,
};
return __compact_pgdat(pgdat, &cc);
@@ -726,7 +929,7 @@ static int compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
- .sync = true,
+ .mode = COMPACT_SYNC,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
@@ -795,3 +998,5 @@ void compaction_unregister_node(struct node *node)
return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
+#endif /* CONFIG_COMPACTION */
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14..a4a5260b027 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
-#include <linux/syscalls.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
@@ -1478,44 +1477,6 @@ out:
}
EXPORT_SYMBOL(generic_file_aio_read);
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t index, unsigned long nr)
-{
- if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
- return -EINVAL;
-
- force_page_cache_readahead(mapping, filp, index, nr);
- return 0;
-}
-
-SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
-{
- ssize_t ret;
- struct file *file;
-
- ret = -EBADF;
- file = fget(fd);
- if (file) {
- if (file->f_mode & FMODE_READ) {
- struct address_space *mapping = file->f_mapping;
- pgoff_t start = offset >> PAGE_CACHE_SHIFT;
- pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
- unsigned long len = end - start + 1;
- ret = do_readahead(mapping, file, start, len);
- }
- fput(file);
- }
- return ret;
-}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
-{
- return SYSC_readahead((int) fd, offset, (size_t) count);
-}
-SYSCALL_ALIAS(sys_readahead, SyS_readahead);
-#endif
-
#ifdef CONFIG_MMU
/**
* page_cache_read - adds requested page to the page cache if not already there
@@ -1938,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page);
-/*
- * The logic we want is
- *
- * if suid or (sgid and xgrp)
- * remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
- umode_t mode = dentry->d_inode->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-static int __remove_suid(struct dentry *dentry, int kill)
-{
- struct iattr newattrs;
-
- newattrs.ia_valid = ATTR_FORCE | kill;
- return notify_change(dentry, &newattrs);
-}
-
-int file_remove_suid(struct file *file)
-{
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- int killsuid;
- int killpriv;
- int error = 0;
-
- /* Fast path for nothing security related */
- if (IS_NOSEC(inode))
- return 0;
-
- killsuid = should_remove_suid(dentry);
- killpriv = security_inode_need_killpriv(dentry);
-
- if (killpriv < 0)
- return killpriv;
- if (killpriv)
- error = security_inode_killpriv(dentry);
- if (!error && killsuid)
- error = __remove_suid(dentry, killsuid);
- if (!error && (inode->i_sb->s_flags & MS_NOSEC))
- inode->i_flags |= S_NOSEC;
-
- return error;
-}
-EXPORT_SYMBOL(file_remove_suid);
-
static size_t __iovec_copy_from_user_inatomic(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
@@ -2528,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
- file_update_time(file);
+ err = file_update_time(file);
+ if (err)
+ goto out;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a4eb3113222..213ca1f5340 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (ret)
goto out_backing;
- file_update_time(filp);
+ ret = file_update_time(filp);
+ if (ret)
+ goto out_backing;
ret = __xip_file_write (filp, buf, count, pos, ppos);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb5..57c4b930901 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
unsigned long haddr, pmd_t *pmd,
struct page *page)
{
- int ret = 0;
pgtable_t pgtable;
VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
- if (unlikely(!pgtable)) {
- mem_cgroup_uncharge_page(page);
- put_page(page);
+ if (unlikely(!pgtable))
return VM_FAULT_OOM;
- }
clear_huge_page(page, haddr, HPAGE_PMD_NR);
__SetPageUptodate(page);
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
spin_unlock(&mm->page_table_lock);
}
- return ret;
+ return 0;
}
static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
put_page(page);
goto out;
}
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
+ page))) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
+ }
- return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+ return 0;
}
out:
/*
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
put_page(page);
goto out;
}
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
+ split_huge_page(page);
put_page(page);
ret |= VM_FAULT_OOM;
goto out;
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+ spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
+ goto out;
} else {
pmd_t entry;
VM_BUG_ON(!PageHead(page));
@@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page)
{
int i;
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
int tail_count = 0;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
compound_lock(page);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(page);
@@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page)
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
-
- lru_add_page_tail(zone, page, page_tail);
+ lru_add_page_tail(page, page_tail, lruvec);
}
atomic_sub(tail_count, &page->_count);
BUG_ON(atomic_read(&page->_count) <= 0);
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
ClearPageCompound(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b8ce6f45095..e198831276a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
/* Locate each segment we overlap with, and count that overlap. */
list_for_each_entry(rg, head, link) {
- int seg_from;
- int seg_to;
+ long seg_from;
+ long seg_to;
if (rg->to <= f)
continue;
@@ -532,7 +532,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
- struct page *page;
+ struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
struct zonelist *zonelist;
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
kref_get(&reservations->refs);
}
+static void resv_map_put(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (!reservations)
+ return;
+ kref_put(&reservations->refs, resv_map_release);
+}
+
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
reserve = (end - start) -
region_count(&reservations->regions, start, end);
- kref_put(&reservations->refs, resv_map_release);
+ resv_map_put(vma);
if (reserve) {
hugetlb_acct_memory(h, -reserve);
@@ -2213,6 +2222,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, vma, page, writable);
return entry;
}
@@ -2498,7 +2508,6 @@ retry_avoidcopy:
if (outside_reserve) {
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
- BUG_ON(page_count(old_page) != 1);
BUG_ON(huge_pte_none(pte));
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
@@ -2791,6 +2800,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* so no worry about deadlock.
*/
page = pte_page(entry);
+ get_page(page);
if (page != pagecache_page)
lock_page(page);
@@ -2822,6 +2832,7 @@ out_page_table_lock:
}
if (page != pagecache_page)
unlock_page(page);
+ put_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2989,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode,
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- if (chg < 0)
- return chg;
+ if (chg < 0) {
+ ret = chg;
+ goto out_err;
+ }
/* There must be enough pages in the subpool for the mapping */
- if (hugepage_subpool_get_pages(spool, chg))
- return -ENOSPC;
+ if (hugepage_subpool_get_pages(spool, chg)) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
/*
* Check enough hugepages are available for the reservation.
@@ -3003,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode,
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
hugepage_subpool_put_pages(spool, chg);
- return ret;
+ goto out_err;
}
/*
@@ -3020,6 +3035,10 @@ int hugetlb_reserve_pages(struct inode *inode,
if (!vma || vma->vm_flags & VM_MAYSHARE)
region_add(&inode->i_mapping->private_list, from, to);
return 0;
+out_err:
+ if (vma)
+ resv_map_put(vma);
+ return ret;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/internal.h b/mm/internal.h
index 2189af49178..5cbb7819004 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,12 +94,52 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
+extern void set_pageblock_migratetype(struct page *page, int migratetype);
+extern int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype);
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+#include <linux/compaction.h>
+
+/*
+ * in mm/compaction.c
+ */
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+ struct list_head freepages; /* List of free pages to migrate to */
+ struct list_head migratepages; /* List of pages being migrated */
+ unsigned long nr_freepages; /* Number of isolated free pages */
+ unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ enum compact_mode mode; /* Compaction mode */
+
+ int order; /* order a direct compactor needs */
+ int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ struct zone *zone;
+
+ /* Number of UNMOVABLE destination pageblocks skipped during scan */
+ unsigned long nr_pageblocks_skipped;
+};
+
+unsigned long
+isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
+unsigned long
+isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn);
+
+#endif
/*
* function for dealing with page's order in buddy system.
@@ -131,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
* to determine if it's being mapped into a LOCKED vma.
* If so, mark page as mlocked.
*/
-static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
+ struct page *page)
{
VM_BUG_ON(PageLRU(page));
@@ -189,7 +230,7 @@ extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
#endif
#else /* !CONFIG_MMU */
-static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
{
return 0;
}
@@ -309,3 +350,7 @@ extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
+
+extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b667..deff1b64a08 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,10 @@
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
+#include <linux/falloc.h>
#include <linux/sched.h>
#include <linux/ksm.h>
+#include <linux/fs.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,8 +202,7 @@ static long madvise_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
- struct address_space *mapping;
- loff_t offset, endoff;
+ loff_t offset;
int error;
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
@@ -217,16 +218,14 @@ static long madvise_remove(struct vm_area_struct *vma,
if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
return -EACCES;
- mapping = vma->vm_file->f_mapping;
-
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- endoff = (loff_t)(end - vma->vm_start - 1)
- + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- /* vmtruncate_range needs to take i_mutex */
+ /* filesystem's fallocate may need to take i_mutex */
up_read(&current->mm->mmap_sem);
- error = vmtruncate_range(mapping->host, offset, endoff);
+ error = do_fallocate(vma->vm_file,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - start);
down_read(&current->mm->mmap_sem);
return error;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 99f28559950..952123eba43 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
int memblock_debug __initdata_memblock;
static int memblock_can_resize __initdata_memblock;
+static int memblock_memory_in_slab __initdata_memblock = 0;
+static int memblock_reserved_in_slab __initdata_memblock = 0;
/* inline so we don't get a warning when pr_debug is compiled out */
static inline const char *memblock_type_name(struct memblock_type *type)
@@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
struct memblock_region *new_array, *old_array;
phys_addr_t old_size, new_size, addr;
int use_slab = slab_is_available();
+ int *in_slab;
/* We don't allow resizing until we know about the reserved regions
* of memory that aren't suitable for allocation
@@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
old_size = type->max * sizeof(struct memblock_region);
new_size = old_size << 1;
+ /* Retrieve the slab flag */
+ if (type == &memblock.memory)
+ in_slab = &memblock_memory_in_slab;
+ else
+ in_slab = &memblock_reserved_in_slab;
+
/* Try to find some space for it.
*
* WARNING: We assume that either slab_is_available() and we use it or
@@ -212,14 +221,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
if (use_slab) {
new_array = kmalloc(new_size, GFP_KERNEL);
addr = new_array ? __pa(new_array) : 0;
- } else
+ } else {
addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+ new_array = addr ? __va(addr) : 0;
+ }
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
memblock_type_name(type), type->max, type->max * 2);
return -1;
}
- new_array = __va(addr);
memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
@@ -234,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
type->regions = new_array;
type->max <<= 1;
- /* If we use SLAB that's it, we are done */
- if (use_slab)
- return 0;
-
- /* Add the new reserved region now. Should not fail ! */
- BUG_ON(memblock_reserve(addr, new_size));
-
- /* If the array wasn't our static init one, then free it. We only do
- * that before SLAB is available as later on, we don't know whether
- * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
- * anyways
+ /* Free old array. We needn't free it if the array is the
+ * static one
*/
- if (old_array != memblock_memory_init_regions &&
- old_array != memblock_reserved_init_regions)
+ if (*in_slab)
+ kfree(old_array);
+ else if (old_array != memblock_memory_init_regions &&
+ old_array != memblock_reserved_init_regions)
memblock_free(__pa(old_array), old_size);
+ /* Reserve the new array if that comes from the memblock.
+ * Otherwise, we needn't do it
+ */
+ if (!use_slab)
+ BUG_ON(memblock_reserve(addr, new_size));
+
+ /* Update slab flag */
+ *in_slab = use_slab;
+
return 0;
}
@@ -330,6 +342,9 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
phys_addr_t end = base + memblock_cap_size(base, &size);
int i, nr_new;
+ if (!size)
+ return 0;
+
/* special case for empty array */
if (type->regions[0].size == 0) {
WARN_ON(type->cnt != 1 || type->total_size);
@@ -430,6 +445,9 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
*start_rgn = *end_rgn = 0;
+ if (!size)
+ return 0;
+
/* we'll create at most two more regions */
while (type->cnt + 2 > type->max)
if (memblock_double_array(type) < 0)
@@ -514,7 +532,6 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
(unsigned long long)base,
(unsigned long long)base + size,
(void *)_RET_IP_);
- BUG_ON(0 == size);
return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7d698df4a06..ac35bccadb7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,7 @@
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
-struct mem_cgroup *root_mem_cgroup __read_mostly;
+static struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -73,7 +73,7 @@ static int really_do_swap_account __initdata = 0;
#endif
#else
-#define do_swap_account (0)
+#define do_swap_account 0
#endif
@@ -88,18 +88,31 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
- MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
MEM_CGROUP_STAT_NSTATS,
};
+static const char * const mem_cgroup_stat_names[] = {
+ "cache",
+ "rss",
+ "mapped_file",
+ "swap",
+};
+
enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
- MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS,
};
+
+static const char * const mem_cgroup_events_names[] = {
+ "pgpgin",
+ "pgpgout",
+ "pgfault",
+ "pgmajfault",
+};
+
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
* it will be incremated by the number of pages. This counter is used for
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
-#define THRESHOLDS_EVENTS_TARGET (128)
-#define SOFTLIMIT_EVENTS_TARGET (1024)
-#define NUMAINFO_EVENTS_TARGET (1024)
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+#define NUMAINFO_EVENTS_TARGET 1024
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone {
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
- struct zone_reclaim_stat reclaim_stat;
struct rb_node tree_node; /* RB tree node */
unsigned long long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold {
/* For threshold */
struct mem_cgroup_threshold_ary {
- /* An array index points to threshold just below usage. */
+ /* An array index points to threshold just below or equal to usage. */
int current_threshold;
/* Size of entries[] */
unsigned int size;
@@ -245,8 +258,8 @@ struct mem_cgroup {
*/
struct rcu_head rcu_freeing;
/*
- * But when using vfree(), that cannot be done at
- * interrupt time, so we must then queue the work.
+ * We also need some space for a worker in deferred freeing.
+ * By the time we call it, rcu_freeing is no longer in use.
*/
struct work_struct work_freeing;
};
@@ -305,7 +318,7 @@ struct mem_cgroup {
/*
* percpu counter.
*/
- struct mem_cgroup_stat_cpu *stat;
+ struct mem_cgroup_stat_cpu __percpu *stat;
/*
* used when a cpu is offlined or other synchronizations
* See mem_cgroup_read_stat().
@@ -360,8 +373,8 @@ static bool move_file(void)
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
-#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
-#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -377,8 +390,8 @@ enum charge_type {
#define _MEM (0)
#define _MEMSWAP (1)
#define _OOM_TYPE (2)
-#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
-#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
+#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
@@ -404,6 +417,7 @@ void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
struct mem_cgroup *memcg;
+ struct cg_proto *cg_proto;
BUG_ON(!sk->sk_prot->proto_cgroup);
@@ -423,9 +437,10 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (!mem_cgroup_is_root(memcg)) {
+ cg_proto = sk->sk_prot->proto_cgroup(memcg);
+ if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
mem_cgroup_get(memcg);
- sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+ sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
}
@@ -454,6 +469,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
#endif /* CONFIG_INET */
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+ if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ return;
+ static_key_slow_dec(&memcg_socket_limit_enabled);
+}
+#else
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+}
+#endif
+
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
@@ -718,12 +746,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
+ __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
preempt_enable();
}
unsigned long
+mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ return mz->lru_size[lru];
+}
+
+static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
@@ -770,7 +807,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+ val = __this_cpu_read(memcg->stat->nr_page_events);
next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
if ((long)next - (long)val < 0) {
@@ -1013,7 +1050,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
- * @mem: memcg of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
*
* Returns the lru list vector holding pages for the given @zone and
* @mem. This can be the global zone lruvec, if the memory controller
@@ -1046,19 +1083,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
*/
/**
- * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
- * @zone: zone of the page
+ * mem_cgroup_page_lruvec - return lruvec for adding an lru page
* @page: the page
- * @lru: current lru
- *
- * This function accounts for @page being added to @lru, and returns
- * the lruvec for the given @zone and the memcg @page is charged to.
- *
- * The callsite is then responsible for physically linking the page to
- * the returned lruvec->lists[@lru].
+ * @zone: zone of the page
*/
-struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
- enum lru_list lru)
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
@@ -1071,7 +1100,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
memcg = pc->mem_cgroup;
/*
- * Surreptitiously switch any uncharged page to root:
+ * Surreptitiously switch any uncharged offlist page to root:
* an uncharged page off lru does nothing to secure
* its former mem_cgroup from sudden removal.
*
@@ -1079,85 +1108,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
* under page_cgroup lock: between them, they make all uses
* of pc->mem_cgroup safe.
*/
- if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
+ if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
mz = page_cgroup_zoneinfo(memcg, page);
- /* compound_order() is stabilized through lru_lock */
- mz->lru_size[lru] += 1 << compound_order(page);
return &mz->lruvec;
}
/**
- * mem_cgroup_lru_del_list - account for removing an lru page
- * @page: the page
- * @lru: target lru
+ * mem_cgroup_update_lru_size - account for adding or removing an lru page
+ * @lruvec: mem_cgroup per zone lru vector
+ * @lru: index of lru list the page is sitting on
+ * @nr_pages: positive when adding or negative when removing
*
- * This function accounts for @page being removed from @lru.
- *
- * The callsite is then responsible for physically unlinking
- * @page->lru.
+ * This function must be called when a page is added to or removed from an
+ * lru list.
*/
-void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int nr_pages)
{
struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
- struct page_cgroup *pc;
+ unsigned long *lru_size;
if (mem_cgroup_disabled())
return;
- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
- VM_BUG_ON(!memcg);
- mz = page_cgroup_zoneinfo(memcg, page);
- /* huge page split is done under lru_lock. so, we have no races. */
- VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
- mz->lru_size[lru] -= 1 << compound_order(page);
-}
-
-void mem_cgroup_lru_del(struct page *page)
-{
- mem_cgroup_lru_del_list(page, page_lru(page));
-}
-
-/**
- * mem_cgroup_lru_move_lists - account for moving a page between lrus
- * @zone: zone of the page
- * @page: the page
- * @from: current lru
- * @to: target lru
- *
- * This function accounts for @page being moved between the lrus @from
- * and @to, and returns the lruvec for the given @zone and the memcg
- * @page is charged to.
- *
- * The callsite is then responsible for physically relinking
- * @page->lru to the returned lruvec->lists[@to].
- */
-struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
- struct page *page,
- enum lru_list from,
- enum lru_list to)
-{
- /* XXX: Optimize this, especially for @from == @to */
- mem_cgroup_lru_del_list(page, from);
- return mem_cgroup_lru_add_list(zone, page, to);
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ lru_size = mz->lru_size + lru;
+ *lru_size += nr_pages;
+ VM_BUG_ON((long)(*lru_size) < 0);
}
/*
* Checks whether given mem is same or in the root_mem_cgroup's
* hierarchy subtree
*/
+bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+ struct mem_cgroup *memcg)
+{
+ if (root_memcg == memcg)
+ return true;
+ if (!root_memcg->use_hierarchy)
+ return false;
+ return css_is_ancestor(&memcg->css, &root_memcg->css);
+}
+
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
- struct mem_cgroup *memcg)
+ struct mem_cgroup *memcg)
{
- if (root_memcg != memcg) {
- return (root_memcg->use_hierarchy &&
- css_is_ancestor(&memcg->css, &root_memcg->css));
- }
+ bool ret;
- return true;
+ rcu_read_lock();
+ ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
+ rcu_read_unlock();
+ return ret;
}
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
@@ -1195,19 +1199,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
return ret;
}
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
+int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
unsigned long inactive_ratio;
- int nid = zone_to_nid(zone);
- int zid = zone_idx(zone);
unsigned long inactive;
unsigned long active;
unsigned long gb;
- inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_INACTIVE_ANON));
- active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_ACTIVE_ANON));
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -1218,49 +1218,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
return inactive * inactive_ratio < active;
}
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
+int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
{
unsigned long active;
unsigned long inactive;
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_INACTIVE_FILE));
- active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
- BIT(LRU_ACTIVE_FILE));
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
return (active > inactive);
}
-struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
- struct zone *zone)
-{
- int nid = zone_to_nid(zone);
- int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
- return &mz->reclaim_stat;
-}
-
-struct zone_reclaim_stat *
-mem_cgroup_get_reclaim_stat_from_page(struct page *page)
-{
- struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- pc = lookup_page_cgroup(page);
- if (!PageCgroupUsed(pc))
- return NULL;
- /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
- smp_rmb();
- mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
- return &mz->reclaim_stat;
-}
-
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1634,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
* unused nodes. But scan_nodes is lazily updated and may not cotain
* enough new information. We need to do double check.
*/
-bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
@@ -1669,7 +1637,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
return 0;
}
-bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
@@ -1843,7 +1811,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
-bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+ int order)
{
struct oom_wait_info owait;
bool locked, need_to_kill;
@@ -1992,7 +1961,7 @@ struct memcg_stock_pcp {
unsigned int nr_pages;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE (0)
+#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -2139,7 +2108,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
int i;
spin_lock(&memcg->pcp_counter_lock);
- for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
long x = per_cpu(memcg->stat->count[i], cpu);
per_cpu(memcg->stat->count[i], cpu) = 0;
@@ -2165,7 +2134,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
if (action == CPU_ONLINE)
return NOTIFY_OK;
- if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
for_each_mem_cgroup(iter)
@@ -2427,6 +2396,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
}
/*
+ * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
+ * This is useful when moving usage to parent cgroup.
+ */
+static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ unsigned long bytes = nr_pages * PAGE_SIZE;
+
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
+ if (do_swap_account)
+ res_counter_uncharge_until(&memcg->memsw,
+ memcg->memsw.parent, bytes);
+}
+
+/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller must check css_is_removed() or some if
* it's concern. (dropping refcnt from swap can be called against removed
@@ -2476,11 +2463,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
struct page *page,
unsigned int nr_pages,
- struct page_cgroup *pc,
enum charge_type ctype,
bool lrucare)
{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
struct zone *uninitialized_var(zone);
+ struct lruvec *lruvec;
bool was_on_lru = false;
bool anon;
@@ -2503,8 +2491,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page)) {
+ lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
ClearPageLRU(page);
- del_page_from_lru_list(zone, page, page_lru(page));
+ del_page_from_lru_list(page, lruvec, page_lru(page));
was_on_lru = true;
}
}
@@ -2522,9 +2511,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
if (lrucare) {
if (was_on_lru) {
+ lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- add_page_to_lru_list(zone, page, page_lru(page));
+ add_page_to_lru_list(page, lruvec, page_lru(page));
}
spin_unlock_irq(&zone->lru_lock);
}
@@ -2547,7 +2537,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2578,23 +2568,19 @@ void mem_cgroup_split_huge_fixup(struct page *head)
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
- * @uncharge: whether we should call uncharge and css_put against @from.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
* - compound_lock is held when nr_pages > 1
*
- * This function doesn't do "charge" nor css_get to new cgroup. It should be
- * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
- * true, this function does "uncharge" from old cgroup, but it doesn't if
- * @uncharge is false, so a caller should do "uncharge".
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
*/
static int mem_cgroup_move_account(struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
struct mem_cgroup *from,
- struct mem_cgroup *to,
- bool uncharge)
+ struct mem_cgroup *to)
{
unsigned long flags;
int ret;
@@ -2628,9 +2614,6 @@ static int mem_cgroup_move_account(struct page *page,
preempt_enable();
}
mem_cgroup_charge_statistics(from, anon, -nr_pages);
- if (uncharge)
- /* This is not "cancel", but cancel_charge does all we need. */
- __mem_cgroup_cancel_charge(from, nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
@@ -2664,15 +2647,13 @@ static int mem_cgroup_move_parent(struct page *page,
struct mem_cgroup *child,
gfp_t gfp_mask)
{
- struct cgroup *cg = child->css.cgroup;
- struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
unsigned int nr_pages;
unsigned long uninitialized_var(flags);
int ret;
/* Is ROOT ? */
- if (!pcg)
+ if (mem_cgroup_is_root(child))
return -EINVAL;
ret = -EBUSY;
@@ -2683,21 +2664,23 @@ static int mem_cgroup_move_parent(struct page *page,
nr_pages = hpage_nr_pages(page);
- parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
- if (ret)
- goto put_back;
+ parent = parent_mem_cgroup(child);
+ /*
+ * If no parent, move charges to root cgroup.
+ */
+ if (!parent)
+ parent = root_mem_cgroup;
if (nr_pages > 1)
flags = compound_lock_irqsave(page);
- ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
- if (ret)
- __mem_cgroup_cancel_charge(parent, nr_pages);
+ ret = mem_cgroup_move_account(page, nr_pages,
+ pc, child, parent);
+ if (!ret)
+ __mem_cgroup_cancel_local_charge(child, nr_pages);
if (nr_pages > 1)
compound_unlock_irqrestore(page, flags);
-put_back:
putback_lru_page(page);
put:
put_page(page);
@@ -2716,7 +2699,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
{
struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
- struct page_cgroup *pc;
bool oom = true;
int ret;
@@ -2730,11 +2712,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
oom = false;
}
- pc = lookup_page_cgroup(page);
ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
if (ret == -ENOMEM)
return ret;
- __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
+ __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
return 0;
}
@@ -2831,16 +2812,13 @@ static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
enum charge_type ctype)
{
- struct page_cgroup *pc;
-
if (mem_cgroup_disabled())
return;
if (!memcg)
return;
cgroup_exclude_rmdir(&memcg->css);
- pc = lookup_page_cgroup(page);
- __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
+ __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
@@ -2850,24 +2828,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
*/
if (do_swap_account && PageSwapCache(page)) {
swp_entry_t ent = {.val = page_private(page)};
- struct mem_cgroup *swap_memcg;
- unsigned short id;
-
- id = swap_cgroup_record(ent, 0);
- rcu_read_lock();
- swap_memcg = mem_cgroup_lookup(id);
- if (swap_memcg) {
- /*
- * This recorded memcg can be obsolete one. So, avoid
- * calling css_tryget
- */
- if (!mem_cgroup_is_root(swap_memcg))
- res_counter_uncharge(&swap_memcg->memsw,
- PAGE_SIZE);
- mem_cgroup_swap_statistics(swap_memcg, false);
- mem_cgroup_put(swap_memcg);
- }
- rcu_read_unlock();
+ mem_cgroup_uncharge_swap(ent);
}
/*
* At swapin, we may charge account against cgroup which has no tasks.
@@ -3160,7 +3121,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
* @entry: swap entry to be moved
* @from: mem_cgroup which the entry is moved from
* @to: mem_cgroup which the entry is moved to
- * @need_fixup: whether we should fixup res_counters and refcounts.
*
* It succeeds only when the swap_cgroup's record for this entry is the same
* as the mem_cgroup's id of @from.
@@ -3171,7 +3131,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
* both res and memsw, and called css_get().
*/
static int mem_cgroup_move_swap_account(swp_entry_t entry,
- struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+ struct mem_cgroup *from, struct mem_cgroup *to)
{
unsigned short old_id, new_id;
@@ -3190,24 +3150,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
* swap-in, the refcount of @to might be decreased to 0.
*/
mem_cgroup_get(to);
- if (need_fixup) {
- if (!mem_cgroup_is_root(from))
- res_counter_uncharge(&from->memsw, PAGE_SIZE);
- mem_cgroup_put(from);
- /*
- * we charged both to->res and to->memsw, so we should
- * uncharge to->res.
- */
- if (!mem_cgroup_is_root(to))
- res_counter_uncharge(&to->res, PAGE_SIZE);
- }
return 0;
}
return -EINVAL;
}
#else
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
- struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+ struct mem_cgroup *from, struct mem_cgroup *to)
{
return -EINVAL;
}
@@ -3298,14 +3247,13 @@ int mem_cgroup_prepare_migration(struct page *page,
* page. In the case new page is migrated but not remapped, new page's
* mapcount will be finally 0 and we call uncharge in end_migration().
*/
- pc = lookup_page_cgroup(newpage);
if (PageAnon(page))
ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
else if (page_is_file_cache(page))
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
+ __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
return ret;
}
@@ -3369,7 +3317,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
@@ -3379,11 +3327,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
pc = lookup_page_cgroup(oldpage);
/* fix accounting on old pages */
lock_page_cgroup(pc);
- memcg = pc->mem_cgroup;
- mem_cgroup_charge_statistics(memcg, false, -1);
- ClearPageCgroupUsed(pc);
+ if (PageCgroupUsed(pc)) {
+ memcg = pc->mem_cgroup;
+ mem_cgroup_charge_statistics(memcg, false, -1);
+ ClearPageCgroupUsed(pc);
+ }
unlock_page_cgroup(pc);
+ /*
+ * When called from shmem_replace_page(), in some cases the
+ * oldpage has already been charged, and in some cases not.
+ */
+ if (!memcg)
+ return;
+
if (PageSwapBacked(oldpage))
type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
@@ -3392,7 +3349,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
* the newpage may be on LRU(or pagevec for LRU) already. We lock
* LRU while we overwrite pc->mem_cgroup.
*/
- __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
+ __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
}
#ifdef CONFIG_DEBUG_VM
@@ -3763,7 +3720,7 @@ move_account:
goto try_to_free;
cond_resched();
/* "ret" should also be checked to ensure all lists are empty. */
- } while (memcg->res.usage > 0 || ret);
+ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
out:
css_put(&memcg->css);
return ret;
@@ -3778,7 +3735,7 @@ try_to_free:
lru_add_drain_all();
/* try to free all pages in this cgroup */
shrink = 1;
- while (nr_retries && memcg->res.usage > 0) {
+ while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
int progress;
if (signal_pending(current)) {
@@ -3799,7 +3756,7 @@ try_to_free:
goto move_account;
}
-int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
+static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}
@@ -3879,14 +3836,21 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
-static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
+static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
+ struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ char str[64];
u64 val;
- int type, name;
+ int type, name, len;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
+
+ if (!do_swap_account && type == _MEMSWAP)
+ return -EOPNOTSUPP;
+
switch (type) {
case _MEM:
if (name == RES_USAGE)
@@ -3903,7 +3867,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
default:
BUG();
}
- return val;
+
+ len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+ return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
/*
* The user of this function is...
@@ -3919,6 +3885,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
+
+ if (!do_swap_account && type == _MEMSWAP)
+ return -EOPNOTSUPP;
+
switch (name) {
case RES_LIMIT:
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -3984,12 +3954,15 @@ out:
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
int type, name;
- memcg = mem_cgroup_from_cont(cont);
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
+
+ if (!do_swap_account && type == _MEMSWAP)
+ return -EOPNOTSUPP;
+
switch (name) {
case RES_MAX_USAGE:
if (type == _MEM)
@@ -4041,103 +4014,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
}
#endif
-
-/* For read statistics */
-enum {
- MCS_CACHE,
- MCS_RSS,
- MCS_FILE_MAPPED,
- MCS_PGPGIN,
- MCS_PGPGOUT,
- MCS_SWAP,
- MCS_PGFAULT,
- MCS_PGMAJFAULT,
- MCS_INACTIVE_ANON,
- MCS_ACTIVE_ANON,
- MCS_INACTIVE_FILE,
- MCS_ACTIVE_FILE,
- MCS_UNEVICTABLE,
- NR_MCS_STAT,
-};
-
-struct mcs_total_stat {
- s64 stat[NR_MCS_STAT];
-};
-
-struct {
- char *local_name;
- char *total_name;
-} memcg_stat_strings[NR_MCS_STAT] = {
- {"cache", "total_cache"},
- {"rss", "total_rss"},
- {"mapped_file", "total_mapped_file"},
- {"pgpgin", "total_pgpgin"},
- {"pgpgout", "total_pgpgout"},
- {"swap", "total_swap"},
- {"pgfault", "total_pgfault"},
- {"pgmajfault", "total_pgmajfault"},
- {"inactive_anon", "total_inactive_anon"},
- {"active_anon", "total_active_anon"},
- {"inactive_file", "total_inactive_file"},
- {"active_file", "total_active_file"},
- {"unevictable", "total_unevictable"}
-};
-
-
-static void
-mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
-{
- s64 val;
-
- /* per cpu stat */
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
- s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
- s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
- s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
- s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
- s->stat[MCS_PGPGOUT] += val;
- if (do_swap_account) {
- val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
- s->stat[MCS_SWAP] += val * PAGE_SIZE;
- }
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
- s->stat[MCS_PGFAULT] += val;
- val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
- s->stat[MCS_PGMAJFAULT] += val;
-
- /* per zone stat */
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
- s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
- s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
- s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
- s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
- s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
-}
-
-static void
-mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
-{
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- mem_cgroup_get_local_stat(iter, s);
-}
-
#ifdef CONFIG_NUMA
-static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
+static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
{
int nid;
unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
unsigned long node_nr;
- struct cgroup *cont = m->private;
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
@@ -4178,64 +4061,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
}
#endif /* CONFIG_NUMA */
+static const char * const mem_cgroup_lru_names[] = {
+ "inactive_anon",
+ "active_anon",
+ "inactive_file",
+ "active_file",
+ "unevictable",
+};
+
+static inline void mem_cgroup_lru_names_not_uptodate(void)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+}
+
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
- struct cgroup_map_cb *cb)
+ struct seq_file *m)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- struct mcs_total_stat mystat;
- int i;
+ struct mem_cgroup *mi;
+ unsigned int i;
- memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_local_stat(memcg, &mystat);
-
-
- for (i = 0; i < NR_MCS_STAT; i++) {
- if (i == MCS_SWAP && !do_swap_account)
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
continue;
- cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
+ seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+ mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
}
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
+ seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
+ mem_cgroup_read_events(memcg, i));
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
+ mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+
/* Hierarchical information */
{
unsigned long long limit, memsw_limit;
memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
- cb->fill(cb, "hierarchical_memory_limit", limit);
+ seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
if (do_swap_account)
- cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
+ seq_printf(m, "hierarchical_memsw_limit %llu\n",
+ memsw_limit);
}
- memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_total_stat(memcg, &mystat);
- for (i = 0; i < NR_MCS_STAT; i++) {
- if (i == MCS_SWAP && !do_swap_account)
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ long long val = 0;
+
+ if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
continue;
- cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
+ seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+ }
+
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ unsigned long long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_read_events(mi, i);
+ seq_printf(m, "total_%s %llu\n",
+ mem_cgroup_events_names[i], val);
+ }
+
+ for (i = 0; i < NR_LRU_LISTS; i++) {
+ unsigned long long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
+ seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
}
#ifdef CONFIG_DEBUG_VM
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
+ struct zone_reclaim_stat *rstat;
unsigned long recent_rotated[2] = {0, 0};
unsigned long recent_scanned[2] = {0, 0};
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ rstat = &mz->lruvec.reclaim_stat;
- recent_rotated[0] +=
- mz->reclaim_stat.recent_rotated[0];
- recent_rotated[1] +=
- mz->reclaim_stat.recent_rotated[1];
- recent_scanned[0] +=
- mz->reclaim_stat.recent_scanned[0];
- recent_scanned[1] +=
- mz->reclaim_stat.recent_scanned[1];
+ recent_rotated[0] += rstat->recent_rotated[0];
+ recent_rotated[1] += rstat->recent_rotated[1];
+ recent_scanned[0] += rstat->recent_scanned[0];
+ recent_scanned[1] += rstat->recent_scanned[1];
}
- cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
- cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
- cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
- cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
+ seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
+ seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
+ seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
+ seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
}
#endif
@@ -4297,7 +4216,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
usage = mem_cgroup_usage(memcg, swap);
/*
- * current_threshold points to threshold just below usage.
+ * current_threshold points to threshold just below or equal to usage.
* If it's not true, a threshold was crossed after last
* call of __mem_cgroup_threshold().
*/
@@ -4423,14 +4342,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
/* Find current threshold */
new->current_threshold = -1;
for (i = 0; i < size; i++) {
- if (new->entries[i].threshold < usage) {
+ if (new->entries[i].threshold <= usage) {
/*
* new->current_threshold will not be used until
* rcu_assign_pointer(), so it's safe to increment
* it here.
*/
++new->current_threshold;
- }
+ } else
+ break;
}
/* Free old spare buffer and save old primary buffer as spare */
@@ -4499,7 +4419,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
continue;
new->entries[j] = thresholds->primary->entries[i];
- if (new->entries[j].threshold < usage) {
+ if (new->entries[j].threshold <= usage) {
/*
* new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
@@ -4513,6 +4433,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
+
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
@@ -4607,46 +4533,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
return 0;
}
-#ifdef CONFIG_NUMA
-static const struct file_operations mem_control_numa_stat_file_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
-{
- struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
- file->f_op = &mem_control_numa_stat_file_operations;
- return single_open(file, mem_control_numa_stat_show, cont);
-}
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
-static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
- /*
- * Part of this would be better living in a separate allocation
- * function, leaving us with just the cgroup tree population work.
- * We, however, depend on state such as network's proto_list that
- * is only initialized after cgroup creation. I found the less
- * cumbersome way to deal with it to defer it all to populate time
- */
- return mem_cgroup_sockets_init(cont, ss);
+ return mem_cgroup_sockets_init(memcg, ss);
};
-static void kmem_cgroup_destroy(struct cgroup *cont)
+static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
- mem_cgroup_sockets_destroy(cont);
+ mem_cgroup_sockets_destroy(memcg);
}
#else
-static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
return 0;
}
-static void kmem_cgroup_destroy(struct cgroup *cont)
+static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
}
#endif
@@ -4655,7 +4558,7 @@ static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
},
@@ -4663,29 +4566,29 @@ static struct cftype mem_cgroup_files[] = {
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
},
{
.name = "stat",
- .read_map = mem_control_stat_show,
+ .read_seq_string = mem_control_stat_show,
},
{
.name = "force_empty",
@@ -4717,18 +4620,14 @@ static struct cftype mem_cgroup_files[] = {
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .open = mem_control_numa_stat_open,
- .mode = S_IRUGO,
+ .read_seq_string = mem_control_numa_stat_show,
},
#endif
-};
-
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
},
@@ -4736,41 +4635,28 @@ static struct cftype memsw_cgroup_files[] = {
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read,
+ .read = mem_cgroup_read,
},
-};
-
-static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
-{
- if (!do_swap_account)
- return 0;
- return cgroup_add_files(cont, ss, memsw_cgroup_files,
- ARRAY_SIZE(memsw_cgroup_files));
-};
-#else
-static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
-{
- return 0;
-}
#endif
+ { }, /* terminate */
+};
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
- enum lru_list lru;
int zone, tmp = node;
/*
* This routine is called against possible nodes.
@@ -4788,8 +4674,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
- for_each_lru(lru)
- INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
+ lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
mz->usage_in_excess = 0;
mz->on_tree = false;
mz->memcg = memcg;
@@ -4832,23 +4717,40 @@ out_free:
}
/*
- * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
* but in process context. The work_freeing structure is overlaid
* on the rcu_freeing structure, which itself is overlaid on memsw.
*/
-static void vfree_work(struct work_struct *work)
+static void free_work(struct work_struct *work)
{
struct mem_cgroup *memcg;
+ int size = sizeof(struct mem_cgroup);
memcg = container_of(work, struct mem_cgroup, work_freeing);
- vfree(memcg);
+ /*
+ * We need to make sure that (at least for now), the jump label
+ * destruction code runs outside of the cgroup lock. This is because
+ * get_online_cpus(), which is called from the static_branch update,
+ * can't be called inside the cgroup_lock. cpusets are the ones
+ * enforcing this dependency, so if they ever change, we might as well.
+ *
+ * schedule_work() will guarantee this happens. Be careful if you need
+ * to move this code around, and make sure it is outside
+ * the cgroup_lock.
+ */
+ disarm_sock_keys(memcg);
+ if (size < PAGE_SIZE)
+ kfree(memcg);
+ else
+ vfree(memcg);
}
-static void vfree_rcu(struct rcu_head *rcu_head)
+
+static void free_rcu(struct rcu_head *rcu_head)
{
struct mem_cgroup *memcg;
memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
- INIT_WORK(&memcg->work_freeing, vfree_work);
+ INIT_WORK(&memcg->work_freeing, free_work);
schedule_work(&memcg->work_freeing);
}
@@ -4874,10 +4776,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
- if (sizeof(struct mem_cgroup) < PAGE_SIZE)
- kfree_rcu(memcg, rcu_freeing);
- else
- call_rcu(&memcg->rcu_freeing, vfree_rcu);
+ call_rcu(&memcg->rcu_freeing, free_rcu);
}
static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -5016,6 +4915,17 @@ mem_cgroup_create(struct cgroup *cont)
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
+
+ error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+ if (error) {
+ /*
+ * We call put now because our (and parent's) refcnts
+ * are already in place. mem_cgroup_put() will internally
+ * call __mem_cgroup_free, so return directly
+ */
+ mem_cgroup_put(memcg);
+ return ERR_PTR(error);
+ }
return &memcg->css;
free_out:
__mem_cgroup_free(memcg);
@@ -5033,28 +4943,11 @@ static void mem_cgroup_destroy(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- kmem_cgroup_destroy(cont);
+ kmem_cgroup_destroy(memcg);
mem_cgroup_put(memcg);
}
-static int mem_cgroup_populate(struct cgroup_subsys *ss,
- struct cgroup *cont)
-{
- int ret;
-
- ret = cgroup_add_files(cont, ss, mem_cgroup_files,
- ARRAY_SIZE(mem_cgroup_files));
-
- if (!ret)
- ret = register_memsw_files(cont, ss);
-
- if (!ret)
- ret = register_kmem_files(cont, ss);
-
- return ret;
-}
-
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256
@@ -5147,7 +5040,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
return NULL;
if (PageAnon(page)) {
/* we don't move shared anon */
- if (!move_anon() || page_mapcount(page) > 2)
+ if (!move_anon())
return NULL;
} else if (!move_file())
/* we ignore mapcount for file pages */
@@ -5158,32 +5051,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
return page;
}
+#ifdef CONFIG_SWAP
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
- int usage_count;
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
if (!move_anon() || non_swap_entry(ent))
return NULL;
- usage_count = mem_cgroup_count_swap_user(ent, &page);
- if (usage_count > 1) { /* we don't move shared anon */
- if (page)
- put_page(page);
- return NULL;
- }
+ /*
+ * Because lookup_swap_cache() updates some statistics counter,
+ * we call find_get_page() with swapper_space directly.
+ */
+ page = find_get_page(&swapper_space, ent.val);
if (do_swap_account)
entry->val = ent.val;
return page;
}
+#else
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ return NULL;
+}
+#endif
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
- struct inode *inode;
struct address_space *mapping;
pgoff_t pgoff;
@@ -5192,7 +5090,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
if (!move_file())
return NULL;
- inode = vma->vm_file->f_path.dentry->d_inode;
mapping = vma->vm_file->f_mapping;
if (pte_none(ptent))
pgoff = linear_page_index(vma, addr);
@@ -5481,7 +5378,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* part of thp split is not executed yet.
*/
if (pmd_trans_huge_lock(pmd, vma) == 1) {
- if (!mc.precharge) {
+ if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -5491,8 +5388,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (!isolate_lru_page(page)) {
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
- pc, mc.from, mc.to,
- false)) {
+ pc, mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
@@ -5522,7 +5418,7 @@ retry:
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, 1, pc,
- mc.from, mc.to, false)) {
+ mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -5533,8 +5429,7 @@ put: /* get_mctgt_type() gets the page */
break;
case MC_TARGET_SWAP:
ent = target.ent;
- if (!mem_cgroup_move_swap_account(ent,
- mc.from, mc.to, false)) {
+ if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
mc.precharge--;
/* we fixup refcnts and charges later. */
mc.moved_swap++;
@@ -5610,7 +5505,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
if (mm) {
if (mc.to)
mem_cgroup_move_charge(mm);
- put_swap_token(mm);
mmput(mm);
}
if (mc.to)
@@ -5638,12 +5532,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
.create = mem_cgroup_create,
.pre_destroy = mem_cgroup_pre_destroy,
.destroy = mem_cgroup_destroy,
- .populate = mem_cgroup_populate,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
+ .base_cftypes = mem_cgroup_files,
.early_init = 0,
.use_id = 1,
+ .__DEPRECATED_clear_css_refs = true,
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 97cc2733551..ab1e7145e29 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,23 +1388,23 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
*/
if (!get_page_unless_zero(compound_head(p))) {
if (PageHuge(p)) {
- pr_info("get_any_page: %#lx free huge page\n", pfn);
+ pr_info("%s: %#lx free huge page\n", __func__, pfn);
ret = dequeue_hwpoisoned_huge_page(compound_head(p));
} else if (is_free_buddy_page(p)) {
- pr_info("get_any_page: %#lx free buddy page\n", pfn);
+ pr_info("%s: %#lx free buddy page\n", __func__, pfn);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p);
ret = 0;
} else {
- pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
- pfn, p->flags);
+ pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
+ __func__, pfn, p->flags);
ret = -EIO;
}
} else {
/* Not a free page */
ret = 1;
}
- unset_migratetype_isolate(p);
+ unset_migratetype_isolate(p, MIGRATE_MOVABLE);
unlock_memory_hotplug();
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa8..1b7dc662bf9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1295,7 +1295,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, unsigned long *nr_accounted,
+ unsigned long end_addr,
struct zap_details *details)
{
unsigned long start = max(vma->vm_start, start_addr);
@@ -1307,8 +1307,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (end <= vma->vm_start)
return;
- if (vma->vm_flags & VM_ACCOUNT)
- *nr_accounted += (end - start) >> PAGE_SHIFT;
+ if (vma->vm_file)
+ uprobe_munmap(vma, start, end);
if (unlikely(is_pfn_mapping(vma)))
untrack_pfn_vma(vma, 0, 0);
@@ -1339,8 +1339,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
- * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
- * @details: details of nonlinear truncation or shared cache invalidation
*
* Unmap all pages in the vma list.
*
@@ -1355,15 +1353,13 @@ static void unmap_single_vma(struct mmu_gather *tlb,
*/
void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, unsigned long *nr_accounted,
- struct zap_details *details)
+ unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
- unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
- details);
+ unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}
@@ -1376,19 +1372,21 @@ void unmap_vmas(struct mmu_gather *tlb,
*
* Caller must protect the VMA list
*/
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- unsigned long end = address + size;
- unsigned long nr_accounted = 0;
+ unsigned long end = start + size;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
- tlb_finish_mmu(&tlb, address, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+ unmap_single_vma(&tlb, vma, start, end, details);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
/**
@@ -1406,13 +1404,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
unsigned long end = address + size;
- unsigned long nr_accounted = 0;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, address, end);
- unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
+ unmap_single_vma(&tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(mm, address, end);
tlb_finish_mmu(&tlb, address, end);
}
@@ -2911,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- grab_swap_token(mm); /* Contend for token _before_ read-in */
page = swapin_readahead(entry,
GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
@@ -2941,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
locked = lock_page_or_retry(page, mm, flags);
+
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
@@ -3489,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
+retry:
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
@@ -3502,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd, flags);
} else {
pmd_t orig_pmd = *pmd;
+ int ret;
+
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd))
- return do_huge_pmd_wp_page(mm, vma, address,
- pmd, orig_pmd);
+ !pmd_trans_splitting(orig_pmd)) {
+ ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+ orig_pmd);
+ /*
+ * If COW results in an oom, the huge pmd will
+ * have been split, so retry the fault on the
+ * pte for a smaller charge.
+ */
+ if (unlikely(ret & VM_FAULT_OOM))
+ goto retry;
+ return ret;
+ }
return 0;
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6629fafd6ce..0d7e3ec8e0f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->end = start + size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
if (request_resource(&iomem_resource, res) < 0) {
- printk("System RAM resource %llx - %llx cannot be added\n",
- (unsigned long long)res->start, (unsigned long long)res->end);
+ printk("System RAM resource %pR cannot be added\n", res);
kfree(res);
res = NULL;
}
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
online_pages_range);
if (ret) {
mutex_unlock(&zonelists_mutex);
- printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
- nr_pages, pfn);
+ printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (((unsigned long long) pfn + nr_pages)
+ << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
unlock_memory_hotplug();
return ret;
@@ -891,7 +892,7 @@ static int __ref offline_pages(unsigned long start_pfn,
nr_pages = end_pfn - start_pfn;
/* set above range as isolated */
- ret = start_isolate_page_range(start_pfn, end_pfn);
+ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
if (ret)
goto out;
@@ -956,7 +957,7 @@ repeat:
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
/* reset pagetype flags and makes migrate type to be MOVABLE */
- undo_isolate_page_range(start_pfn, end_pfn);
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
/* removal success */
zone->present_pages -= offlined_pages;
zone->zone_pgdat->node_present_pages -= offlined_pages;
@@ -977,11 +978,12 @@ repeat:
return 0;
failed_removal:
- printk(KERN_INFO "memory offlining %lx to %lx failed\n",
- start_pfn, end_pfn);
+ printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
- undo_isolate_page_range(start_pfn, end_pfn);
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
out:
unlock_memory_hotplug();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cfb6c867875..f15c1b24ca1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) && step == 0 &&
+ if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -607,27 +607,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return first;
}
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
-{
- int err = 0;
- struct mempolicy *old = vma->vm_policy;
-
- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
- vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_ops, vma->vm_file,
- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
-
- if (vma->vm_ops && vma->vm_ops->set_policy)
- err = vma->vm_ops->set_policy(vma, new);
- if (!err) {
- mpol_get(new);
- vma->vm_policy = new;
- mpol_put(old);
- }
- return err;
-}
-
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
@@ -676,9 +655,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
- err = policy_vma(vma, new_pol);
- if (err)
- goto out;
+
+ /*
+ * Apply policy to a single VMA. The reference counting of
+ * policy for vma_policy linkages has already been handled by
+ * vma_merge and split_vma as necessary. If this is a shared
+ * policy then ->set_policy will increment the reference count
+ * for an sp node.
+ */
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
+ err = vma->vm_ops->set_policy(vma, new_pol);
+ if (err)
+ goto out;
+ }
}
out:
@@ -957,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
*
* Returns the number of page that could not be moved.
*/
-int do_migrate_pages(struct mm_struct *mm,
- const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to, int flags)
{
int busy = 0;
int err;
@@ -970,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
down_read(&mm->mmap_sem);
- err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+ err = migrate_vmas(mm, from, to, flags);
if (err)
goto out;
@@ -1005,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm,
* moved to an empty node, then there is nothing left worth migrating.
*/
- tmp = *from_nodes;
+ tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
int source = -1;
int dest = 0;
for_each_node_mask(s, tmp) {
- d = node_remap(s, *from_nodes, *to_nodes);
+
+ /*
+ * do_migrate_pages() tries to maintain the relative
+ * node relationship of the pages established between
+ * threads and memory areas.
+ *
+ * However if the number of source nodes is not equal to
+ * the number of destination nodes we can not preserve
+ * this node relative relationship. In that case, skip
+ * copying memory from a node that is in the destination
+ * mask.
+ *
+ * Example: [2,3,4] -> [3,4,5] moves everything.
+ * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
+ */
+
+ if ((nodes_weight(*from) != nodes_weight(*to)) &&
+ (node_isset(s, *to)))
+ continue;
+
+ d = node_remap(s, *from, *to);
if (s == d)
continue;
@@ -1072,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
{
}
-int do_migrate_pages(struct mm_struct *mm,
- const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to, int flags)
{
return -ENOSYS;
}
@@ -1334,8 +1347,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
* userid as the target process.
*/
tcred = __task_cred(task);
- if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
- cred->uid != tcred->suid && cred->uid != tcred->uid &&
+ if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
!capable(CAP_SYS_NICE)) {
rcu_read_unlock();
err = -EPERM;
@@ -1361,11 +1374,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
mm = get_task_mm(task);
put_task_struct(task);
- if (mm)
- err = do_migrate_pages(mm, old, new,
- capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
- else
+
+ if (!mm) {
err = -EINVAL;
+ goto out;
+ }
+
+ err = do_migrate_pages(mm, old, new,
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
mmput(mm);
out:
diff --git a/mm/migrate.c b/mm/migrate.c
index 51c08a0c6f6..ab81d482ae6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1371,8 +1371,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
* userid as the target process.
*/
tcred = __task_cred(task);
- if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
- cred->uid != tcred->suid && cred->uid != tcred->uid &&
+ if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
!capable(CAP_SYS_NICE)) {
rcu_read_unlock();
err = -EPERM;
@@ -1388,14 +1388,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
mm = get_task_mm(task);
put_task_struct(task);
- if (mm) {
- if (nodes)
- err = do_pages_move(mm, task_nodes, nr_pages, pages,
- nodes, status, flags);
- else
- err = do_pages_stat(mm, nr_pages, pages, status);
- } else
- err = -EINVAL;
+ if (!mm)
+ return -EINVAL;
+
+ if (nodes)
+ err = do_pages_move(mm, task_nodes, nr_pages, pages,
+ nodes, status, flags);
+ else
+ err = do_pages_stat(mm, nr_pages, pages, status);
mmput(mm);
return err;
diff --git a/mm/mmap.c b/mm/mmap.c
index a7bf6a31c9f..3edfcdfa42d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,7 @@
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
+#include <linux/uprobes.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -240,6 +241,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
return next;
}
+static unsigned long do_brk(unsigned long addr, unsigned long len);
+
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long rlim, retval;
@@ -544,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end);
if (file) {
mapping = file->f_mapping;
- if (!(vma->vm_flags & VM_NONLINEAR))
+ if (!(vma->vm_flags & VM_NONLINEAR)) {
root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+
+ if (adjust_next)
+ uprobe_munmap(next, next->vm_start,
+ next->vm_end);
+ }
+
mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
/*
@@ -615,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end);
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
+ if (root) {
+ uprobe_mmap(vma);
+
+ if (adjust_next)
+ uprobe_mmap(next);
+ }
+
if (remove_next) {
if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
if (next->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(mm);
@@ -636,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end);
goto again;
}
}
+ if (insert && file)
+ uprobe_mmap(insert);
validate_mm(mm);
@@ -958,8 +978,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
struct mm_struct * mm = current->mm;
struct inode *inode;
vm_flags_t vm_flags;
- int error;
- unsigned long reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1081,13 +1099,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
}
}
- error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
- if (error)
- return error;
-
return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
-EXPORT_SYMBOL(do_mmap_pgoff);
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
@@ -1120,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- down_write(&current->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&current->mm->mmap_sem);
-
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
out:
@@ -1344,6 +1354,11 @@ out:
mm->locked_vm += (len >> PAGE_SHIFT);
} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
make_pages_present(addr, addr + len);
+
+ if (file && uprobe_mmap(vma))
+ /* matching probes but cannot insert */
+ goto unmap_and_free_vma;
+
return addr;
unmap_and_free_vma:
@@ -1579,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (addr & ~PAGE_MASK)
return -EINVAL;
- return arch_rebalance_pgtables(addr, len);
+ addr = arch_rebalance_pgtables(addr, len);
+ error = security_mmap_addr(addr);
+ return error ? error : addr;
}
EXPORT_SYMBOL(get_unmapped_area);
@@ -1589,33 +1606,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
- if (mm) {
- /* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node * rb_node;
-
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
-
- while (rb_node) {
- struct vm_area_struct * vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
+ return NULL;
+
+ /* Check the cache first. */
+ /* (Cache hit rate is typically around 35%.) */
+ vma = mm->mmap_cache;
+ if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+ struct rb_node *rb_node;
+
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
+
+ while (rb_node) {
+ struct vm_area_struct *vma_tmp;
+
+ vma_tmp = rb_entry(rb_node,
+ struct vm_area_struct, vm_rb);
+
+ if (vma_tmp->vm_end > addr) {
+ vma = vma_tmp;
+ if (vma_tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+ if (vma)
+ mm->mmap_cache = vma;
}
return vma;
}
@@ -1768,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma,
return -ENOMEM;
address &= PAGE_MASK;
- error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+ error = security_mmap_addr(address);
if (error)
return error;
@@ -1862,15 +1880,20 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
*/
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
+ unsigned long nr_accounted = 0;
+
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
do {
long nrpages = vma_pages(vma);
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += nrpages;
mm->total_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
+ vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
@@ -1885,13 +1908,11 @@ static void unmap_region(struct mm_struct *mm,
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
struct mmu_gather tlb;
- unsigned long nr_accounted = 0;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
- vm_unacct_memory(nr_accounted);
+ unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : 0);
tlb_finish_mmu(&tlb, start, end);
@@ -2106,20 +2127,23 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
-EXPORT_SYMBOL(do_munmap);
-
-SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+int vm_munmap(unsigned long start, size_t len)
{
int ret;
struct mm_struct *mm = current->mm;
- profile_munmap(addr);
-
down_write(&mm->mmap_sem);
- ret = do_munmap(mm, addr, len);
+ ret = do_munmap(mm, start, len);
up_write(&mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ profile_munmap(addr);
+ return vm_munmap(addr, len);
+}
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
@@ -2136,7 +2160,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-unsigned long do_brk(unsigned long addr, unsigned long len)
+static unsigned long do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
@@ -2149,10 +2173,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (!len)
return addr;
- error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
- if (error)
- return error;
-
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2232,7 +2252,17 @@ out:
return addr;
}
-EXPORT_SYMBOL(do_brk);
+unsigned long vm_brk(unsigned long addr, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long ret;
+
+ down_write(&mm->mmap_sem);
+ ret = do_brk(addr, len);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(vm_brk);
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
@@ -2264,8 +2294,7 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
- vm_unacct_memory(nr_accounted);
+ unmap_vmas(&tlb, vma, 0, -1);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(&tlb, 0, -1);
@@ -2274,8 +2303,12 @@ void exit_mmap(struct mm_struct *mm)
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
- while (vma)
+ while (vma) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
+ }
+ vm_unacct_memory(nr_accounted);
BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
@@ -2311,6 +2344,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
+
+ if (vma->vm_file && uprobe_mmap(vma))
+ return -EINVAL;
+
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
}
@@ -2380,6 +2417,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma->vm_pgoff = pgoff;
if (new_vma->vm_file) {
get_file(new_vma->vm_file);
+
+ if (uprobe_mmap(new_vma))
+ goto out_free_mempol;
+
if (vma->vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
}
@@ -2484,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
- ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
- if (ret)
- goto out;
-
ret = insert_vm_struct(mm, vma);
if (ret)
goto out;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7cf7b7ddc7c..6830eab5bf0 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+{
+ enum lru_list lru;
+
+ memset(lruvec, 0, sizeof(struct lruvec));
+
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&lruvec->lists[lru]);
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+ lruvec->zone = zone;
+#endif
+}
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5a7..21fed202dda 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr,
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
-
ret = do_munmap(mm, new_addr, new_len);
if (ret)
goto out;
@@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
* MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
* This option implies MREMAP_MAYMOVE.
*/
-unsigned long do_mremap(unsigned long addr,
- unsigned long old_len, unsigned long new_len,
- unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
+ down_write(&current->mm->mmap_sem);
+
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
goto out;
@@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr,
goto out;
}
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
ret = move_vma(vma, addr, old_len, new_len, new_addr);
}
out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
- return ret;
-}
-
-SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
- unsigned long, new_len, unsigned long, flags,
- unsigned long, new_addr)
-{
- unsigned long ret;
-
- down_write(&current->mm->mmap_sem);
- ret = do_mremap(addr, old_len, new_len, flags, new_addr);
up_write(&current->mm->mmap_sem);
return ret;
}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 24f0fc1a56d..d23415c001b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,8 +82,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
- int i;
- unsigned long start_aligned, end_aligned;
+ unsigned long i, start_aligned, end_aligned;
int order = ilog2(BITS_PER_LONG);
start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
@@ -275,6 +274,57 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
+static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+again:
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, limit);
+ if (ptr)
+ return ptr;
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, limit);
+ if (ptr)
+ return ptr;
+
+ if (goal) {
+ goal = 0;
+ goto again;
+ }
+
+ return NULL;
+}
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+}
+
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ return NULL;
+}
+
/**
* __alloc_bootmem_node - allocate boot memory from a specific node
* @pgdat: node to allocate from
@@ -293,18 +343,10 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- void *ptr;
-
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, -1ULL);
- if (ptr)
- return ptr;
-
- return __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, -1ULL);
+ return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
}
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -313,44 +355,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
return __alloc_bootmem_node(pgdat, size, align, goal);
}
-#ifdef CONFIG_SPARSEMEM
-/**
- * alloc_bootmem_section - allocate boot memory from a specific section
- * @size: size of the request in bytes
- * @section_nr: sparse map section to allocate from
- *
- * Return NULL on failure.
- */
-void * __init alloc_bootmem_section(unsigned long size,
- unsigned long section_nr)
-{
- unsigned long pfn, goal, limit;
-
- pfn = section_nr_to_pfn(section_nr);
- goal = pfn << PAGE_SHIFT;
- limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
-
- return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
- SMP_CACHE_BYTES, goal, limit);
-}
-#endif
-
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- void *ptr;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, -1ULL);
- if (ptr)
- return ptr;
-
- return __alloc_bootmem_nopanic(size, align, goal);
-}
-
#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
#endif
@@ -392,16 +396,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- void *ptr;
-
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
- if (ptr)
- return ptr;
-
- return __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
+ return ___alloc_bootmem_node(pgdat, size, align, goal,
+ ARCH_LOW_ADDRESS_LIMIT);
}
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e170fceb..c4acfbc0997 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
unsigned long *_capabilities)
{
unsigned long capabilities, rlen;
- unsigned long reqprot = prot;
int ret;
/* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
}
/* allow the security API to have its say */
- ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+ ret = security_mmap_addr(addr);
if (ret < 0)
return ret;
@@ -1470,7 +1469,6 @@ error_getting_region:
show_free_areas(0);
return -ENOMEM;
}
-EXPORT_SYMBOL(do_mmap_pgoff);
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
@@ -1488,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- down_write(&current->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&current->mm->mmap_sem);
+ ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
@@ -1709,16 +1705,22 @@ erase_whole_vma:
}
EXPORT_SYMBOL(do_munmap);
-SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+int vm_munmap(unsigned long addr, size_t len)
{
- int ret;
struct mm_struct *mm = current->mm;
+ int ret;
down_write(&mm->mmap_sem);
ret = do_munmap(mm, addr, len);
up_write(&mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ return vm_munmap(addr, len);
+}
/*
* release all the mappings made in a process's VM space
@@ -1744,7 +1746,7 @@ void exit_mmap(struct mm_struct *mm)
kleave("");
}
-unsigned long do_brk(unsigned long addr, unsigned long len)
+unsigned long vm_brk(unsigned long addr, unsigned long len)
{
return -ENOMEM;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 46bf2ed5594..ed0e1967736 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p,
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
- const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
+ const nodemask_t *nodemask, unsigned long totalpages)
{
- long points;
+ unsigned long points;
if (oom_unkillable_task(p, memcg, nodemask))
return 0;
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
}
/*
- * The memory controller may have a limit of 0 bytes, so avoid a divide
- * by zero, if necessary.
- */
- if (!totalpages)
- totalpages = 1;
-
- /*
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + p->mm->nr_ptes;
- points += get_mm_counter(p->mm, MM_SWAPENTS);
-
- points *= 1000;
- points /= totalpages;
+ points = get_mm_rss(p->mm) + p->mm->nr_ptes +
+ get_mm_counter(p->mm, MM_SWAPENTS);
task_unlock(p);
/*
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- points -= 30;
+ points -= 30 * totalpages / 1000;
/*
* /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
* either completely disable oom killing or always prefer a certain
* task.
*/
- points += p->signal->oom_score_adj;
+ points += p->signal->oom_score_adj * totalpages / 1000;
/*
- * Never return 0 for an eligible task that may be killed since it's
- * possible that no single user task uses more than 0.1% of memory and
- * no single admin tasks uses more than 3.0%.
+ * Never return 0 for an eligible task regardless of the root bonus and
+ * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
*/
- if (points <= 0)
- return 1;
- return (points < 1000) ? points : 1000;
+ return points ? points : 1;
}
/*
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
- *ppoints = 0;
+ unsigned long chosen_points = 0;
do_each_thread(g, p) {
unsigned int points;
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
*/
if (p == current) {
chosen = p;
- *ppoints = 1000;
+ chosen_points = ULONG_MAX;
} else if (!force_kill) {
/*
* If this task is not being ptraced on exit,
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
}
points = oom_badness(p, memcg, nodemask, totalpages);
- if (points > *ppoints) {
+ if (points > chosen_points) {
chosen = p;
- *ppoints = points;
+ chosen_points = points;
}
} while_each_thread(g, p);
+ *ppoints = chosen_points * 1000 / totalpages;
return chosen;
}
@@ -410,8 +398,8 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
}
pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
- task->pid, task_uid(task), task->tgid,
- task->mm->total_vm, get_mm_rss(task->mm),
+ task->pid, from_kuid(&init_user_ns, task_uid(task)),
+ task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
task_cpu(task), task->signal->oom_adj,
task->signal->oom_score_adj, task->comm);
task_unlock(task);
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
- limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
+ limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
read_lock(&tasklist_lock);
p = select_bad_process(&points, limit, memcg, NULL, false);
if (p && PTR_ERR(p) != -1UL)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8ca2e..93d8d2f7108 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -204,7 +204,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
* Returns the global number of pages potentially available for dirty
* page cache. This is the base value for the global dirty limits.
*/
-unsigned long global_dirtyable_memory(void)
+static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
@@ -1568,6 +1568,7 @@ void writeback_set_ratelimit(void)
unsigned long background_thresh;
unsigned long dirty_thresh;
global_dirty_limits(&background_thresh, &dirty_thresh);
+ global_dirty_limit = dirty_thresh;
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a712fb9e04c..6092f331b32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
#include <asm/tlbflush.h>
@@ -218,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled))
@@ -513,10 +514,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* free pages of length of (1 << order) and marked with _mapcount -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
- * other. That is, if we allocate a small block, and both were
- * free, the remainder of the region must be split into blocks.
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
- * triggers coalescing into a block of larger size.
+ * triggers coalescing into a block of larger size.
*
* -- wli
*/
@@ -749,6 +750,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
__free_pages(page, order);
}
+#ifdef CONFIG_CMA
+/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+ unsigned i = pageblock_nr_pages;
+ struct page *p = page;
+
+ do {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ } while (++p, --i);
+
+ set_page_refcounted(page);
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+ __free_pages(page, pageblock_order);
+ totalram_pages += pageblock_nr_pages;
+}
+#endif
/*
* The order of subdivision here is critical for the IO subsystem.
@@ -874,11 +893,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
-static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
+static int fallbacks[MIGRATE_TYPES][4] = {
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+#ifdef CONFIG_CMA
+ [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
+#else
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+#endif
+ [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
+ [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
};
/*
@@ -929,8 +954,8 @@ static int move_freepages(struct zone *zone,
return pages_moved;
}
-static int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
+int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -973,12 +998,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1; current_order >= order;
--current_order) {
- for (i = 0; i < MIGRATE_TYPES - 1; i++) {
+ for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
- continue;
+ break;
area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))
@@ -993,11 +1018,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
* aggressive about taking ownership of free pages
+ *
+ * On the other hand, never change migration
+ * type of MIGRATE_CMA pageblocks nor move CMA
+ * pages on different free lists. We don't
+ * want unmovable pages to be allocated from
+ * MIGRATE_CMA areas.
*/
- if (unlikely(current_order >= (pageblock_order >> 1)) ||
- start_migratetype == MIGRATE_RECLAIMABLE ||
- page_group_by_mobility_disabled) {
- unsigned long pages;
+ if (!is_migrate_cma(migratetype) &&
+ (unlikely(current_order >= pageblock_order / 2) ||
+ start_migratetype == MIGRATE_RECLAIMABLE ||
+ page_group_by_mobility_disabled)) {
+ int pages;
pages = move_freepages_block(zone, page,
start_migratetype);
@@ -1015,11 +1047,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
rmv_page_order(page);
/* Take ownership for orders >= pageblock_order */
- if (current_order >= pageblock_order)
+ if (current_order >= pageblock_order &&
+ !is_migrate_cma(migratetype))
change_pageblock_range(page, current_order,
start_migratetype);
- expand(zone, page, order, current_order, area, migratetype);
+ expand(zone, page, order, current_order, area,
+ is_migrate_cma(migratetype)
+ ? migratetype : start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
@@ -1061,17 +1096,17 @@ retry_reserve:
return page;
}
-/*
+/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
-static int rmqueue_bulk(struct zone *zone, unsigned int order,
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
{
- int i;
-
+ int mt = migratetype, i;
+
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype);
@@ -1091,7 +1126,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
- set_page_private(page, migratetype);
+ if (IS_ENABLED(CONFIG_CMA)) {
+ mt = get_pageblock_migratetype(page);
+ if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
+ mt = migratetype;
+ }
+ set_page_private(page, mt);
list = &page->lru;
}
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
@@ -1371,8 +1411,12 @@ int split_free_page(struct page *page)
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
- for (; page < endpage; page += pageblock_nr_pages)
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ for (; page < endpage; page += pageblock_nr_pages) {
+ int mt = get_pageblock_migratetype(page);
+ if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
+ set_pageblock_migratetype(page,
+ MIGRATE_MOVABLE);
+ }
}
return 1 << order;
@@ -2086,16 +2130,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
#endif /* CONFIG_COMPACTION */
-/* The really slow allocator path where we enter direct reclaim */
-static inline struct page *
-__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+/* Perform direct synchronous page reclaim */
+static int
+__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
+ nodemask_t *nodemask)
{
- struct page *page = NULL;
struct reclaim_state reclaim_state;
- bool drained = false;
+ int progress;
cond_resched();
@@ -2106,7 +2147,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
- *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+ progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -2114,6 +2155,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
cond_resched();
+ return progress;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int migratetype, unsigned long *did_some_progress)
+{
+ struct page *page = NULL;
+ bool drained = false;
+
+ *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+ nodemask);
if (unlikely(!(*did_some_progress)))
return NULL;
@@ -4244,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-/* Return a sensible default order for the pageblock size. */
-static inline int pageblock_default_order(void)
-{
- if (HPAGE_SHIFT > PAGE_SHIFT)
- return HUGETLB_PAGE_ORDER;
-
- return MAX_ORDER-1;
-}
-
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(unsigned int order)
+static inline void __init set_pageblock_order(void)
{
+ unsigned int order;
+
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
return;
+ if (HPAGE_SHIFT > PAGE_SHIFT)
+ order = HUGETLB_PAGE_ORDER;
+ else
+ order = MAX_ORDER - 1;
+
/*
* Assume the largest contiguous order of interest is a huge page.
- * This value may be variable depending on boot parameters on IA64
+ * This value may be variable depending on boot parameters on IA64 and
+ * powerpc.
*/
pageblock_order = order;
}
@@ -4270,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order)
/*
* When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * and pageblock_default_order() are unused as pageblock_order is set
- * at compile-time. See include/linux/pageblock-flags.h for the values of
- * pageblock_order based on the kernel config
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
*/
-static inline int pageblock_default_order(unsigned int order)
+static inline void set_pageblock_order(void)
{
- return MAX_ORDER-1;
}
-#define set_pageblock_order(x) do {} while (0)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
@@ -4301,11 +4354,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
-
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
- enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4355,18 +4407,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(lru)
- INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
- zone->reclaim_stat.recent_rotated[0] = 0;
- zone->reclaim_stat.recent_rotated[1] = 0;
- zone->reclaim_stat.recent_scanned[0] = 0;
- zone->reclaim_stat.recent_scanned[1] = 0;
+ lruvec_init(&zone->lruvec, zone);
zap_zone_vm_stats(zone);
zone->flags = 0;
if (!size)
continue;
- set_pageblock_order(pageblock_default_order());
+ set_pageblock_order();
setup_usemap(pgdat, zone, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
@@ -4759,31 +4806,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
- printk("Zone PFN ranges:\n");
+ printk("Zone ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
- printk(" %-8s ", zone_names[i]);
+ printk(KERN_CONT " %-8s ", zone_names[i]);
if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
- printk("empty\n");
+ printk(KERN_CONT "empty\n");
else
- printk("%0#10lx -> %0#10lx\n",
- arch_zone_lowest_possible_pfn[i],
- arch_zone_highest_possible_pfn[i]);
+ printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
+ arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
+ (arch_zone_highest_possible_pfn[i]
+ << PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
- printk("Movable zone start PFN for each node\n");
+ printk("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
- printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
+ printk(" Node %d: %#010lx\n", i,
+ zone_movable_pfn[i] << PAGE_SHIFT);
}
/* Print out the early_node_map[] */
- printk("Early memory PFN ranges\n");
+ printk("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
- printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
+ printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
+ start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
mminit_verify_pageflags_layout();
@@ -4976,14 +5026,7 @@ static void setup_per_zone_lowmem_reserve(void)
calculate_totalreserve_pages();
}
-/**
- * setup_per_zone_wmarks - called when min_free_kbytes changes
- * or when memory is hot-{added|removed}
- *
- * Ensures that the watermark[min,low,high] values for each zone are set
- * correctly with respect to min_free_kbytes.
- */
-void setup_per_zone_wmarks(void)
+static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
@@ -5030,6 +5073,11 @@ void setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+
+ zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
+ zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
+ zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
+
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -5038,6 +5086,20 @@ void setup_per_zone_wmarks(void)
calculate_totalreserve_pages();
}
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+ mutex_lock(&zonelists_mutex);
+ __setup_per_zone_wmarks();
+ mutex_unlock(&zonelists_mutex);
+}
+
/*
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
@@ -5203,7 +5265,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
int ret;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret == -EINVAL))
+ if (!write || (ret < 0))
return ret;
for_each_populated_zone(zone) {
for_each_possible_cpu(cpu) {
@@ -5242,9 +5304,10 @@ void *__init alloc_large_system_hash(const char *tablename,
int flags,
unsigned int *_hash_shift,
unsigned int *_hash_mask,
- unsigned long limit)
+ unsigned long low_limit,
+ unsigned long high_limit)
{
- unsigned long long max = limit;
+ unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
@@ -5282,6 +5345,8 @@ void *__init alloc_large_system_hash(const char *tablename,
}
max = min(max, 0x80000000ULL);
+ if (numentries < low_limit)
+ numentries = low_limit;
if (numentries > max)
numentries = max;
@@ -5412,14 +5477,16 @@ static int
__count_immobile_pages(struct zone *zone, struct page *page, int count)
{
unsigned long pfn, iter, found;
+ int mt;
+
/*
* For avoiding noise data, lru_add_drain_all() should be called
* If ZONE_MOVABLE, the zone never contains immobile pages
*/
if (zone_idx(zone) == ZONE_MOVABLE)
return true;
-
- if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+ mt = get_pageblock_migratetype(page);
+ if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
return true;
pfn = page_to_pfn(page);
@@ -5536,7 +5603,7 @@ out:
return ret;
}
-void unset_migratetype_isolate(struct page *page)
+void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags;
@@ -5544,12 +5611,259 @@ void unset_migratetype_isolate(struct page *page)
spin_lock_irqsave(&zone->lock, flags);
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
goto out;
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- move_freepages_block(zone, page, MIGRATE_MOVABLE);
+ set_pageblock_migratetype(page, migratetype);
+ move_freepages_block(zone, page, migratetype);
out:
spin_unlock_irqrestore(&zone->lock, flags);
}
+#ifdef CONFIG_CMA
+
+static unsigned long pfn_max_align_down(unsigned long pfn)
+{
+ return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages) - 1);
+}
+
+static unsigned long pfn_max_align_up(unsigned long pfn)
+{
+ return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages));
+}
+
+static struct page *
+__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
+ int **resultp)
+{
+ return alloc_page(GFP_HIGHUSER_MOVABLE);
+}
+
+/* [start, end) must belong to a single zone. */
+static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
+{
+ /* This function is based on compact_zone() from compaction.c. */
+
+ unsigned long pfn = start;
+ unsigned int tries = 0;
+ int ret = 0;
+
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start)),
+ .mode = COMPACT_SYNC,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ migrate_prep_local();
+
+ while (pfn < end || !list_empty(&cc.migratepages)) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (list_empty(&cc.migratepages)) {
+ cc.nr_migratepages = 0;
+ pfn = isolate_migratepages_range(cc.zone, &cc,
+ pfn, end);
+ if (!pfn) {
+ ret = -EINTR;
+ break;
+ }
+ tries = 0;
+ } else if (++tries == 5) {
+ ret = ret < 0 ? ret : -EBUSY;
+ break;
+ }
+
+ ret = migrate_pages(&cc.migratepages,
+ __alloc_contig_migrate_alloc,
+ 0, false, MIGRATE_SYNC);
+ }
+
+ putback_lru_pages(&cc.migratepages);
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Update zone's cma pages counter used for watermark level calculation.
+ */
+static inline void __update_cma_watermarks(struct zone *zone, int count)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->min_cma_pages += count;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ setup_per_zone_wmarks();
+}
+
+/*
+ * Trigger memory pressure bump to reclaim some pages in order to be able to
+ * allocate 'count' pages in single page units. Does similar work as
+ *__alloc_pages_slowpath() function.
+ */
+static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
+{
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zonelist *zonelist = node_zonelist(0, gfp_mask);
+ int did_some_progress = 0;
+ int order = 1;
+
+ /*
+ * Increase level of watermarks to force kswapd do his job
+ * to stabilise at new watermark level.
+ */
+ __update_cma_watermarks(zone, count);
+
+ /* Obey watermarks as if the page was being allocated */
+ while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
+ wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
+
+ did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+ NULL);
+ if (!did_some_progress) {
+ /* Exhausted what can be done so it's blamo time */
+ out_of_memory(zonelist, gfp_mask, order, NULL, false);
+ }
+ }
+
+ /* Restore original watermark levels. */
+ __update_cma_watermarks(zone, -count);
+
+ return count;
+}
+
+/**
+ * alloc_contig_range() -- tries to allocate given range of pages
+ * @start: start PFN to allocate
+ * @end: one-past-the-last PFN to allocate
+ * @migratetype: migratetype of the underlaying pageblocks (either
+ * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
+ * in range must have the same migratetype and it must
+ * be either of the two.
+ *
+ * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
+ * aligned, however it's the caller's responsibility to guarantee that
+ * we are the only thread that changes migrate type of pageblocks the
+ * pages fall in.
+ *
+ * The PFN range must belong to a single zone.
+ *
+ * Returns zero on success or negative error code. On success all
+ * pages which PFN is in [start, end) are allocated for the caller and
+ * need to be freed with free_contig_range().
+ */
+int alloc_contig_range(unsigned long start, unsigned long end,
+ unsigned migratetype)
+{
+ struct zone *zone = page_zone(pfn_to_page(start));
+ unsigned long outer_start, outer_end;
+ int ret = 0, order;
+
+ /*
+ * What we do here is we mark all pageblocks in range as
+ * MIGRATE_ISOLATE. Because pageblock and max order pages may
+ * have different sizes, and due to the way page allocator
+ * work, we align the range to biggest of the two pages so
+ * that page allocator won't try to merge buddies from
+ * different pageblocks and change MIGRATE_ISOLATE to some
+ * other migration type.
+ *
+ * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+ * migrate the pages from an unaligned range (ie. pages that
+ * we are interested in). This will put all the pages in
+ * range back to page allocator as MIGRATE_ISOLATE.
+ *
+ * When this is done, we take the pages in range from page
+ * allocator removing them from the buddy system. This way
+ * page allocator will never consider using them.
+ *
+ * This lets us mark the pageblocks back as
+ * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+ * aligned range but not in the unaligned, original range are
+ * put back to page allocator so that buddy can use them.
+ */
+
+ ret = start_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype);
+ if (ret)
+ goto done;
+
+ ret = __alloc_contig_migrate_range(start, end);
+ if (ret)
+ goto done;
+
+ /*
+ * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+ * aligned blocks that are marked as MIGRATE_ISOLATE. What's
+ * more, all pages in [start, end) are free in page allocator.
+ * What we are going to do is to allocate all pages from
+ * [start, end) (that is remove them from page allocator).
+ *
+ * The only problem is that pages at the beginning and at the
+ * end of interesting range may be not aligned with pages that
+ * page allocator holds, ie. they can be part of higher order
+ * pages. Because of this, we reserve the bigger range and
+ * once this is done free the pages we are not interested in.
+ *
+ * We don't have to hold zone->lock here because the pages are
+ * isolated thus they won't get removed from buddy.
+ */
+
+ lru_add_drain_all();
+ drain_all_pages();
+
+ order = 0;
+ outer_start = start;
+ while (!PageBuddy(pfn_to_page(outer_start))) {
+ if (++order >= MAX_ORDER) {
+ ret = -EBUSY;
+ goto done;
+ }
+ outer_start &= ~0UL << order;
+ }
+
+ /* Make sure the range is really isolated. */
+ if (test_pages_isolated(outer_start, end)) {
+ pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
+ outer_start, end);
+ ret = -EBUSY;
+ goto done;
+ }
+
+ /*
+ * Reclaim enough pages to make sure that contiguous allocation
+ * will not starve the system.
+ */
+ __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
+
+ /* Grab isolated pages from freelists. */
+ outer_end = isolate_freepages_range(outer_start, end);
+ if (!outer_end) {
+ ret = -EBUSY;
+ goto done;
+ }
+
+ /* Free head and tail (if any) */
+ if (start != outer_start)
+ free_contig_range(outer_start, start - outer_start);
+ if (end != outer_end)
+ free_contig_range(end, outer_end - end);
+
+done:
+ undo_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype);
+ return ret;
+}
+
+void free_contig_range(unsigned long pfn, unsigned nr_pages)
+{
+ for (; nr_pages--; ++pfn)
+ __free_page(pfn_to_page(pfn));
+}
+#endif
+
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be isolated before calling this.
@@ -5618,7 +5932,7 @@ bool is_free_buddy_page(struct page *page)
}
#endif
-static struct trace_print_flags pageflag_names[] = {
+static const struct trace_print_flags pageflag_names[] = {
{1UL << PG_locked, "locked" },
{1UL << PG_error, "error" },
{1UL << PG_referenced, "referenced" },
@@ -5653,7 +5967,9 @@ static struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_MEMORY_FAILURE
{1UL << PG_hwpoison, "hwpoison" },
#endif
- {-1UL, NULL },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ {1UL << PG_compound_lock, "compound_lock" },
+#endif
};
static void dump_page_flags(unsigned long flags)
@@ -5662,12 +5978,14 @@ static void dump_page_flags(unsigned long flags)
unsigned long mask;
int i;
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
+
printk(KERN_ALERT "page flags: %#lx(", flags);
/* remove zone id */
flags &= (1UL << NR_PAGEFLAGS) - 1;
- for (i = 0; pageflag_names[i].name && flags; i++) {
+ for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
mask = pageflag_names[i].mask;
if ((flags & mask) != mask)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4ae42bb4089..c9f04774f2b 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -24,6 +24,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* to be MIGRATE_ISOLATE.
* @start_pfn: The lower PFN of the range to be isolated.
* @end_pfn: The upper PFN of the range to be isolated.
+ * @migratetype: migrate type to set in error recovery.
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -32,8 +33,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* start_pfn/end_pfn must be aligned to pageblock_order.
* Returns 0 on success and -EBUSY if any part of range cannot be isolated.
*/
-int
-start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
+int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned migratetype)
{
unsigned long pfn;
unsigned long undo_pfn;
@@ -56,7 +57,7 @@ undo:
for (pfn = start_pfn;
pfn < undo_pfn;
pfn += pageblock_nr_pages)
- unset_migratetype_isolate(pfn_to_page(pfn));
+ unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
return -EBUSY;
}
@@ -64,8 +65,8 @@ undo:
/*
* Make isolated pages available again.
*/
-int
-undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
+int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned migratetype)
{
unsigned long pfn;
struct page *page;
@@ -77,7 +78,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
page = __first_valid_page(pfn, pageblock_nr_pages);
if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
continue;
- unset_migratetype_isolate(page);
+ unset_migratetype_isolate(page, migratetype);
}
return 0;
}
@@ -86,7 +87,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
* all pages in [start_pfn...end_pfn) must be in the same zone.
* zone->lock must be held before call this.
*
- * Returns 1 if all pages in the range is isolated.
+ * Returns 1 if all pages in the range are isolated.
*/
static int
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
diff --git a/mm/percpu.c b/mm/percpu.c
index f47af9123af..bb4be7435ce 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1132,20 +1132,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
- printk("\n");
+ printk(KERN_CONT "\n");
printk("%spcpu-alloc: ", lvl);
}
- printk("[%0*d] ", group_width, group);
+ printk(KERN_CONT "[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
- printk("%0*d ", cpu_width,
+ printk(KERN_CONT "%0*d ", cpu_width,
gi->cpu_map[unit]);
else
- printk("%s ", empty_str);
+ printk(KERN_CONT "%s ", empty_str);
}
}
- printk("\n");
+ printk(KERN_CONT "\n");
}
/**
@@ -1650,6 +1650,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
areas[group] = ptr;
base = min(ptr, base);
+ }
+
+ /*
+ * Copy data and free unused parts. This should happen after all
+ * allocations are complete; otherwise, we may end up with
+ * overlapping groups.
+ */
+ for (group = 0; group < ai->nr_groups; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+ void *ptr = areas[group];
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
@@ -1885,6 +1895,8 @@ void __init setup_per_cpu_areas(void)
fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
+ /* kmemleak tracks the percpu allocations separately */
+ kmemleak_free(fc);
ai->dyn_size = unit_size;
ai->unit_size = unit_size;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a74fea182f..74c0ddaa6fa 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -109,8 +109,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
{
pmd_t pmd = pmd_mksplitting(*pmdp);
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index c20ff48994c..926b4664974 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid,
/* Check iovecs */
if (vm_write)
rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l, 1);
+ iovstack_l, &iov_l);
else
rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l, 1);
+ iovstack_l, &iov_l);
if (rc <= 0)
goto free_iovecs;
- rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
- iovstack_r, &iov_r, 0);
+ rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
+ iovstack_r, &iov_r);
if (rc <= 0)
goto free_iovecs;
@@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid,
if (vm_write)
rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
UIO_FASTIOV, iovstack_l,
- &iov_l, 1);
+ &iov_l);
else
rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
UIO_FASTIOV, iovstack_l,
- &iov_l, 1);
+ &iov_l);
if (rc <= 0)
goto free_iovecs;
- rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
+ rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
UIO_FASTIOV, iovstack_r,
- &iov_r, 0);
+ &iov_r);
if (rc <= 0)
goto free_iovecs;
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02f3e2..ea8f8fa2164 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
/*
* Initialise a struct file's readahead state. Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+
+static ssize_t
+do_readahead(struct address_space *mapping, struct file *filp,
+ pgoff_t index, unsigned long nr)
+{
+ if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
+ return -EINVAL;
+
+ force_page_cache_readahead(mapping, filp, index, nr);
+ return 0;
+}
+
+SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
+{
+ ssize_t ret;
+ struct file *file;
+
+ ret = -EBADF;
+ file = fget(fd);
+ if (file) {
+ if (file->f_mode & FMODE_READ) {
+ struct address_space *mapping = file->f_mapping;
+ pgoff_t start = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long len = end - start + 1;
+ ret = do_readahead(mapping, file, start, len);
+ }
+ fput(file);
+ }
+ return ret;
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
+{
+ return SYSC_readahead((int) fd, offset, (size_t) count);
+}
+SYSCALL_ALIAS(sys_readahead, SyS_readahead);
+#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb..0f3b7cda2a2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- /* Pretend the page is referenced if the task has the
- swap token and is in the middle of a page fault. */
- if (mm != current->mm && has_swap_token(mm) &&
- rwsem_is_locked(&mm->mmap_sem))
- referenced++;
-
(*mapcount)--;
if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index f99ff3e50bd..585bd220a21 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
+#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
char value[0];
};
+/*
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+ pgoff_t start; /* start of range currently being fallocated */
+ pgoff_t next; /* the next page offset to be fallocated */
+ pgoff_t nr_falloced; /* how many new pages have been fallocated */
+ pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+};
+
/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
- SGP_WRITE, /* may exceed i_size, may allocate page */
+ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
+ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
#ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
}
#endif
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
@@ -423,27 +440,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
/*
* Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ bool unfalloc)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
- pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+ pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+ unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+ unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
long nr_swaps_freed = 0;
pgoff_t index;
int i;
- BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+ if (lend == -1)
+ end = -1; /* unsigned, so actually very big */
pagevec_init(&pvec, 0);
index = start;
- while (index <= end) {
+ while (index < end) {
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
pvec.pages, indices);
if (!pvec.nr)
break;
@@ -452,10 +473,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
struct page *page = pvec.pages[i];
index = indices[i];
- if (index > end)
+ if (index >= end)
break;
if (radix_tree_exceptional_entry(page)) {
+ if (unfalloc)
+ continue;
nr_swaps_freed += !shmem_free_swap(mapping,
index, page);
continue;
@@ -463,9 +486,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
if (!trylock_page(page))
continue;
- if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
- truncate_inode_page(mapping, page);
+ if (!unfalloc || !PageUptodate(page)) {
+ if (page->mapping == mapping) {
+ VM_BUG_ON(PageWriteback(page));
+ truncate_inode_page(mapping, page);
+ }
}
unlock_page(page);
}
@@ -476,30 +501,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
index++;
}
- if (partial) {
+ if (partial_start) {
struct page *page = NULL;
shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
if (page) {
- zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+ unsigned int top = PAGE_CACHE_SIZE;
+ if (start > end) {
+ top = partial_end;
+ partial_end = 0;
+ }
+ zero_user_segment(page, partial_start, top);
set_page_dirty(page);
unlock_page(page);
page_cache_release(page);
}
}
+ if (partial_end) {
+ struct page *page = NULL;
+ shmem_getpage(inode, end, &page, SGP_READ, NULL);
+ if (page) {
+ zero_user_segment(page, 0, partial_end);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ if (start >= end)
+ return;
index = start;
for ( ; ; ) {
cond_resched();
pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
pvec.pages, indices);
if (!pvec.nr) {
- if (index == start)
+ if (index == start || unfalloc)
break;
index = start;
continue;
}
- if (index == start && indices[0] > end) {
+ if ((index == start || unfalloc) && indices[0] >= end) {
shmem_deswap_pagevec(&pvec);
pagevec_release(&pvec);
break;
@@ -509,19 +551,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
struct page *page = pvec.pages[i];
index = indices[i];
- if (index > end)
+ if (index >= end)
break;
if (radix_tree_exceptional_entry(page)) {
+ if (unfalloc)
+ continue;
nr_swaps_freed += !shmem_free_swap(mapping,
index, page);
continue;
}
lock_page(page);
- if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
- truncate_inode_page(mapping, page);
+ if (!unfalloc || !PageUptodate(page)) {
+ if (page->mapping == mapping) {
+ VM_BUG_ON(PageWriteback(page));
+ truncate_inode_page(mapping, page);
+ }
}
unlock_page(page);
}
@@ -535,7 +581,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
info->swapped -= nr_swaps_freed;
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+}
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ shmem_undo_range(inode, lstart, lend, false);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -597,19 +647,20 @@ static void shmem_evict_inode(struct inode *inode)
}
BUG_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
- end_writeback(inode);
+ clear_inode(inode);
}
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
static int shmem_unuse_inode(struct shmem_inode_info *info,
- swp_entry_t swap, struct page *page)
+ swp_entry_t swap, struct page **pagep)
{
struct address_space *mapping = info->vfs_inode.i_mapping;
void *radswap;
pgoff_t index;
- int error;
+ gfp_t gfp;
+ int error = 0;
radswap = swp_to_radix_entry(swap);
index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +676,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
if (shmem_swaplist.next != &info->swaplist)
list_move_tail(&shmem_swaplist, &info->swaplist);
+ gfp = mapping_gfp_mask(mapping);
+ if (shmem_should_replace_page(*pagep, gfp)) {
+ mutex_unlock(&shmem_swaplist_mutex);
+ error = shmem_replace_page(pagep, gfp, info, index);
+ mutex_lock(&shmem_swaplist_mutex);
+ /*
+ * We needed to drop mutex to make that restrictive page
+ * allocation; but the inode might already be freed by now,
+ * and we cannot refer to inode or mapping or info to check.
+ * However, we do hold page lock on the PageSwapCache page,
+ * so can check if that still has our reference remaining.
+ */
+ if (!page_swapcount(*pagep))
+ error = -ENOENT;
+ }
+
/*
* We rely on shmem_swaplist_mutex, not only to protect the swaplist,
* but also to hold up shmem_evict_inode(): so inode cannot be freed
* beneath us (pagelock doesn't help until the page is in pagecache).
*/
- error = shmem_add_to_page_cache(page, mapping, index,
+ if (!error)
+ error = shmem_add_to_page_cache(*pagep, mapping, index,
GFP_NOWAIT, radswap);
- /* which does mem_cgroup_uncharge_cache_page on error */
-
if (error != -ENOMEM) {
/*
* Truncation and eviction use free_swap_and_cache(), which
* only does trylock page: if we raced, best clean up here.
*/
- delete_from_swap_cache(page);
- set_page_dirty(page);
+ delete_from_swap_cache(*pagep);
+ set_page_dirty(*pagep);
if (!error) {
spin_lock(&info->lock);
info->swapped--;
@@ -660,7 +726,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
struct list_head *this, *next;
struct shmem_inode_info *info;
int found = 0;
- int error;
+ int error = 0;
+
+ /*
+ * There's a faint possibility that swap page was replaced before
+ * caller locked it: it will come back later with the right page.
+ */
+ if (unlikely(!PageSwapCache(page)))
+ goto out;
/*
* Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +749,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
list_for_each_safe(this, next, &shmem_swaplist) {
info = list_entry(this, struct shmem_inode_info, swaplist);
if (info->swapped)
- found = shmem_unuse_inode(info, swap, page);
+ found = shmem_unuse_inode(info, swap, &page);
else
list_del_init(&info->swaplist);
cond_resched();
@@ -685,8 +758,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
}
mutex_unlock(&shmem_swaplist_mutex);
- if (!found)
- mem_cgroup_uncharge_cache_page(page);
if (found < 0)
error = found;
out:
@@ -727,6 +798,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
goto redirty;
}
+
+ /*
+ * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+ * value into swapfile.c, the only way we can correctly account for a
+ * fallocated page arriving here is now to initialize it and write it.
+ *
+ * That's okay for a page already fallocated earlier, but if we have
+ * not yet completed the fallocation, then (a) we want to keep track
+ * of this page in case we have to undo it, and (b) it may not be a
+ * good idea to continue anyway, once we're pushing into swap. So
+ * reactivate the page, and let shmem_fallocate() quit when too many.
+ */
+ if (!PageUptodate(page)) {
+ if (inode->i_private) {
+ struct shmem_falloc *shmem_falloc;
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ index >= shmem_falloc->start &&
+ index < shmem_falloc->next)
+ shmem_falloc->nr_unswapped++;
+ else
+ shmem_falloc = NULL;
+ spin_unlock(&inode->i_lock);
+ if (shmem_falloc)
+ goto redirty;
+ }
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
swap = get_swap_page();
if (!swap.val)
goto redirty;
@@ -856,6 +959,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
#endif
/*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to. If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+ return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ struct page *oldpage, *newpage;
+ struct address_space *swap_mapping;
+ pgoff_t swap_index;
+ int error;
+
+ oldpage = *pagep;
+ swap_index = page_private(oldpage);
+ swap_mapping = page_mapping(oldpage);
+
+ /*
+ * We have arrived here because our zones are constrained, so don't
+ * limit chance of success by further cpuset and node constraints.
+ */
+ gfp &= ~GFP_CONSTRAINT_MASK;
+ newpage = shmem_alloc_page(gfp, info, index);
+ if (!newpage)
+ return -ENOMEM;
+ VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
+
+ *pagep = newpage;
+ page_cache_get(newpage);
+ copy_highpage(newpage, oldpage);
+
+ VM_BUG_ON(!PageLocked(oldpage));
+ __set_page_locked(newpage);
+ VM_BUG_ON(!PageUptodate(oldpage));
+ SetPageUptodate(newpage);
+ VM_BUG_ON(!PageSwapBacked(oldpage));
+ SetPageSwapBacked(newpage);
+ VM_BUG_ON(!swap_index);
+ set_page_private(newpage, swap_index);
+ VM_BUG_ON(!PageSwapCache(oldpage));
+ SetPageSwapCache(newpage);
+
+ /*
+ * Our caller will very soon move newpage out of swapcache, but it's
+ * a nice clean interface for us to replace oldpage by newpage there.
+ */
+ spin_lock_irq(&swap_mapping->tree_lock);
+ error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+ newpage);
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+ __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ spin_unlock_irq(&swap_mapping->tree_lock);
+ BUG_ON(error);
+
+ mem_cgroup_replace_page_cache(oldpage, newpage);
+ lru_cache_add_anon(newpage);
+
+ ClearPageSwapCache(oldpage);
+ set_page_private(oldpage, 0);
+
+ unlock_page(oldpage);
+ page_cache_release(oldpage);
+ page_cache_release(oldpage);
+ return 0;
+}
+
+/*
* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
@@ -872,6 +1053,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
swp_entry_t swap;
int error;
int once = 0;
+ int alloced = 0;
if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
return -EFBIG;
@@ -883,19 +1065,21 @@ repeat:
page = NULL;
}
- if (sgp != SGP_WRITE &&
+ if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
goto failed;
}
+ /* fallocated page? */
+ if (page && !PageUptodate(page)) {
+ if (sgp != SGP_READ)
+ goto clear;
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+ }
if (page || (sgp == SGP_READ && !swap.val)) {
- /*
- * Once we can get the page lock, it must be uptodate:
- * if there were an error in reading back from swap,
- * the page would not be inserted into the filecache.
- */
- BUG_ON(page && !PageUptodate(page));
*pagep = page;
return 0;
}
@@ -923,19 +1107,20 @@ repeat:
/* We have to do this with page locked to prevent races */
lock_page(page);
+ if (!PageSwapCache(page) || page->mapping) {
+ error = -EEXIST; /* try again */
+ goto failed;
+ }
if (!PageUptodate(page)) {
error = -EIO;
goto failed;
}
wait_on_page_writeback(page);
- /* Someone may have already done it for us */
- if (page->mapping) {
- if (page->mapping == mapping &&
- page->index == index)
- goto done;
- error = -EEXIST;
- goto failed;
+ if (shmem_should_replace_page(page, gfp)) {
+ error = shmem_replace_page(&page, gfp, info, index);
+ if (error)
+ goto failed;
}
error = mem_cgroup_cache_charge(page, current->mm,
@@ -991,19 +1176,36 @@ repeat:
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+ alloced = true;
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ /*
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ */
+ if (sgp == SGP_FALLOC)
+ sgp = SGP_WRITE;
+clear:
+ /*
+ * Let SGP_WRITE caller clear ends if write does not fill page;
+ * but SGP_FALLOC on a page fallocated earlier must initialize
+ * it now, lest undo on failure cancel our earlier guarantee.
+ */
+ if (sgp != SGP_WRITE) {
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
if (sgp == SGP_DIRTY)
set_page_dirty(page);
}
-done:
+
/* Perhaps the file has been truncated since we checked */
- if (sgp != SGP_WRITE &&
+ if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
- goto trunc;
+ if (alloced)
+ goto trunc;
+ else
+ goto failed;
}
*pagep = page;
return 0;
@@ -1012,6 +1214,7 @@ done:
* Error recovery.
*/
trunc:
+ info = SHMEM_I(inode);
ClearPageDirty(page);
delete_from_page_cache(page);
spin_lock(&info->lock);
@@ -1019,6 +1222,7 @@ trunc:
inode->i_blocks -= BLOCKS_PER_PAGE;
spin_unlock(&info->lock);
decused:
+ sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks)
percpu_counter_add(&sbinfo->used_blocks, -1);
unacct:
@@ -1204,6 +1408,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
if (pos + copied > inode->i_size)
i_size_write(inode, pos + copied);
+ if (!PageUptodate(page)) {
+ if (copied < PAGE_CACHE_SIZE) {
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ zero_user_segments(page, 0, from,
+ from + copied, PAGE_CACHE_SIZE);
+ }
+ SetPageUptodate(page);
+ }
set_page_dirty(page);
unlock_page(page);
page_cache_release(page);
@@ -1462,6 +1674,199 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ pgoff_t index, pgoff_t end, int origin)
+{
+ struct page *page;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool done = false;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ pvec.nr = 1; /* start small: we may be there already */
+ while (!done) {
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ pvec.nr, pvec.pages, indices);
+ if (!pvec.nr) {
+ if (origin == SEEK_DATA)
+ index = end;
+ break;
+ }
+ for (i = 0; i < pvec.nr; i++, index++) {
+ if (index < indices[i]) {
+ if (origin == SEEK_HOLE) {
+ done = true;
+ break;
+ }
+ index = indices[i];
+ }
+ page = pvec.pages[i];
+ if (page && !radix_tree_exceptional_entry(page)) {
+ if (!PageUptodate(page))
+ page = NULL;
+ }
+ if (index >= end ||
+ (page && origin == SEEK_DATA) ||
+ (!page && origin == SEEK_HOLE)) {
+ done = true;
+ break;
+ }
+ }
+ shmem_deswap_pagevec(&pvec);
+ pagevec_release(&pvec);
+ pvec.nr = PAGEVEC_SIZE;
+ cond_resched();
+ }
+ return index;
+}
+
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct address_space *mapping;
+ struct inode *inode;
+ pgoff_t start, end;
+ loff_t new_offset;
+
+ if (origin != SEEK_DATA && origin != SEEK_HOLE)
+ return generic_file_llseek_size(file, offset, origin,
+ MAX_LFS_FILESIZE);
+ mapping = file->f_mapping;
+ inode = mapping->host;
+ mutex_lock(&inode->i_mutex);
+ /* We're holding i_mutex so we can access i_size directly */
+
+ if (offset < 0)
+ offset = -EINVAL;
+ else if (offset >= inode->i_size)
+ offset = -ENXIO;
+ else {
+ start = offset >> PAGE_CACHE_SHIFT;
+ end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ new_offset = shmem_seek_hole_data(mapping, start, end, origin);
+ new_offset <<= PAGE_CACHE_SHIFT;
+ if (new_offset > offset) {
+ if (new_offset < inode->i_size)
+ offset = new_offset;
+ else if (origin == SEEK_DATA)
+ offset = -ENXIO;
+ else
+ offset = inode->i_size;
+ }
+ }
+
+ if (offset >= 0 && offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct shmem_falloc shmem_falloc;
+ pgoff_t start, index, end;
+ int error;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ struct address_space *mapping = file->f_mapping;
+ loff_t unmap_start = round_up(offset, PAGE_SIZE);
+ loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+
+ if ((u64)unmap_end > (u64)unmap_start)
+ unmap_mapping_range(mapping, unmap_start,
+ 1 + unmap_end - unmap_start, 0);
+ shmem_truncate_range(inode, offset, offset + len - 1);
+ /* No need to unmap again: hole-punching leaves COWed pages */
+ error = 0;
+ goto out;
+ }
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ start = offset >> PAGE_CACHE_SHIFT;
+ end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ /* Try to avoid a swapstorm if len is impossible to satisfy */
+ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ shmem_falloc.start = start;
+ shmem_falloc.next = start;
+ shmem_falloc.nr_falloced = 0;
+ shmem_falloc.nr_unswapped = 0;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
+
+ for (index = start; index < end; index++) {
+ struct page *page;
+
+ /*
+ * Good, the fallocate(2) manpage permits EINTR: we may have
+ * been interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current))
+ error = -EINTR;
+ else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+ error = -ENOMEM;
+ else
+ error = shmem_getpage(inode, index, &page, SGP_FALLOC,
+ NULL);
+ if (error) {
+ /* Remove the !PageUptodate pages we added */
+ shmem_undo_range(inode,
+ (loff_t)start << PAGE_CACHE_SHIFT,
+ (loff_t)index << PAGE_CACHE_SHIFT, true);
+ goto undone;
+ }
+
+ /*
+ * Inform shmem_writepage() how far we have reached.
+ * No need for lock or barrier: we have the page lock.
+ */
+ shmem_falloc.next++;
+ if (!PageUptodate(page))
+ shmem_falloc.nr_falloced++;
+
+ /*
+ * If !PageUptodate, leave it that way so that freeable pages
+ * can be recognized if we need to rollback on error later.
+ * But set_page_dirty so that memory pressure will swap rather
+ * than free the pages we are allocating (and SGP_CACHE pages
+ * might still be clean: we now need to mark those dirty too).
+ */
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ cond_resched();
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+undone:
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1665,6 +2070,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
kaddr = kmap_atomic(page);
memcpy(kaddr, symname, len);
kunmap_atomic(kaddr);
+ SetPageUptodate(page);
set_page_dirty(page);
unlock_page(page);
page_cache_release(page);
@@ -2033,11 +2439,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
return dentry;
}
-static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
- int connectable)
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
-
if (*len < 3) {
*len = 3;
return 255;
@@ -2075,6 +2479,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
bool remount)
{
char *this_char, *value, *rest;
+ uid_t uid;
+ gid_t gid;
while (options != NULL) {
this_char = options;
@@ -2134,15 +2540,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
} else if (!strcmp(this_char,"uid")) {
if (remount)
continue;
- sbinfo->uid = simple_strtoul(value, &rest, 0);
+ uid = simple_strtoul(value, &rest, 0);
if (*rest)
goto bad_val;
+ sbinfo->uid = make_kuid(current_user_ns(), uid);
+ if (!uid_valid(sbinfo->uid))
+ goto bad_val;
} else if (!strcmp(this_char,"gid")) {
if (remount)
continue;
- sbinfo->gid = simple_strtoul(value, &rest, 0);
+ gid = simple_strtoul(value, &rest, 0);
if (*rest)
goto bad_val;
+ sbinfo->gid = make_kgid(current_user_ns(), gid);
+ if (!gid_valid(sbinfo->gid))
+ goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
if (mpol_parse_str(value, &sbinfo->mpol, 1))
goto bad_val;
@@ -2210,10 +2622,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
seq_printf(seq, ",mode=%03ho", sbinfo->mode);
- if (sbinfo->uid != 0)
- seq_printf(seq, ",uid=%u", sbinfo->uid);
- if (sbinfo->gid != 0)
- seq_printf(seq, ",gid=%u", sbinfo->gid);
+ if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
+ seq_printf(seq, ",uid=%u",
+ from_kuid_munged(&init_user_ns, sbinfo->uid));
+ if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u",
+ from_kgid_munged(&init_user_ns, sbinfo->gid));
shmem_show_mpol(seq, sbinfo->mpol);
return 0;
}
@@ -2260,6 +2674,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
}
}
sb->s_export_op = &shmem_export_ops;
+ sb->s_flags |= MS_NOSEC;
#else
sb->s_flags |= MS_NOUSER;
#endif
@@ -2354,7 +2769,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
- .llseek = generic_file_llseek,
+ .llseek = shmem_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
@@ -2362,12 +2777,12 @@ static const struct file_operations shmem_file_operations = {
.fsync = noop_fsync,
.splice_read = shmem_file_splice_read,
.splice_write = generic_file_splice_write,
+ .fallocate = shmem_fallocate,
#endif
};
static const struct inode_operations shmem_inode_operations = {
.setattr = shmem_setattr,
- .truncate_range = shmem_truncate_range,
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
.getxattr = shmem_getxattr,
diff --git a/mm/slub.c b/mm/slub.c
index ef2d67564a5..8c691fa1cf3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2043,7 +2043,7 @@ static bool has_cpu_slab(int cpu, void *info)
struct kmem_cache *s = info;
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- return !!(c->page);
+ return c->page || c->partial;
}
static void flush_all(struct kmem_cache *s)
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364de..6a4bf9160e8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -273,10 +273,10 @@ static unsigned long *__kmalloc_section_usemap(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
- unsigned long count)
+ unsigned long size)
{
- unsigned long section_nr;
-
+ pg_data_t *host_pgdat;
+ unsigned long goal;
/*
* A page may contain usemaps for other sections preventing the
* page being freed and making a section unremovable while
@@ -287,8 +287,10 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
- return alloc_bootmem_section(usemap_size() * count, section_nr);
+ goal = __pa(pgdat) & PAGE_SECTION_MASK;
+ host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
+ return __alloc_bootmem_node_nopanic(host_pgdat, size,
+ SMP_CACHE_BYTES, goal);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -332,9 +334,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
#else
static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
- unsigned long count)
+ unsigned long size)
{
- return NULL;
+ return alloc_bootmem_node_nopanic(pgdat, size);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +354,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
int size = usemap_size();
usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
- usemap_count);
+ size * usemap_count);
if (!usemap) {
- usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
- if (!usemap) {
- printk(KERN_WARNING "%s: allocation failed\n", __func__);
- return;
- }
+ printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ return;
}
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f133897..4e7e2ec6707 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- unsigned long flags;
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
+ unsigned long flags;
spin_lock_irqsave(&zone->lru_lock, flags);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
- del_page_from_lru_list(zone, page, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
}
@@ -82,6 +84,25 @@ static void put_compound_page(struct page *page)
if (likely(page != page_head &&
get_page_unless_zero(page_head))) {
unsigned long flags;
+
+ /*
+ * THP can not break up slab pages so avoid taking
+ * compound_lock(). Slab performs non-atomic bit ops
+ * on page->flags for better performance. In particular
+ * slab_unlock() in slub used to be a hot path. It is
+ * still hot on arches that do not support
+ * this_cpu_cmpxchg_double().
+ */
+ if (PageSlab(page_head)) {
+ if (PageTail(page)) {
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+
+ atomic_dec(&page->_mapcount);
+ goto skip_lock_tail;
+ } else
+ goto skip_lock;
+ }
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
@@ -92,10 +113,10 @@ static void put_compound_page(struct page *page)
if (unlikely(!PageTail(page))) {
/* __split_huge_page_refcount run before us */
compound_unlock_irqrestore(page_head, flags);
- VM_BUG_ON(PageHead(page_head));
+skip_lock:
if (put_page_testzero(page_head))
__put_single_page(page_head);
- out_put_single:
+out_put_single:
if (put_page_testzero(page))
__put_single_page(page);
return;
@@ -115,6 +136,8 @@ static void put_compound_page(struct page *page)
VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
VM_BUG_ON(atomic_read(&page->_count) != 0);
compound_unlock_irqrestore(page_head, flags);
+
+skip_lock_tail:
if (put_page_testzero(page_head)) {
if (PageHead(page_head))
__put_compound_page(page_head);
@@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page)
struct page *page_head = compound_trans_head(page);
if (likely(page != page_head && get_page_unless_zero(page_head))) {
+
+ /* Ref to put_compound_page() comment. */
+ if (PageSlab(page_head)) {
+ if (likely(PageTail(page))) {
+ __get_page_tail_foll(page, false);
+ return true;
+ } else {
+ put_page(page_head);
+ return false;
+ }
+ }
+
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
@@ -202,11 +237,12 @@ void put_pages_list(struct list_head *pages)
EXPORT_SYMBOL(put_pages_list);
static void pagevec_lru_move_fn(struct pagevec *pvec,
- void (*move_fn)(struct page *page, void *arg),
- void *arg)
+ void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
+ void *arg)
{
int i;
struct zone *zone = NULL;
+ struct lruvec *lruvec;
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
@@ -220,7 +256,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
spin_lock_irqsave(&zone->lru_lock, flags);
}
- (*move_fn)(page, arg);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ (*move_fn)(page, lruvec, arg);
}
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -228,16 +265,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
pagevec_reinit(pvec);
}
-static void pagevec_move_tail_fn(struct page *page, void *arg)
+static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
int *pgmoved = arg;
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
enum lru_list lru = page_lru_base_type(page);
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_lru_move_lists(page_zone(page),
- page, lru, lru);
list_move_tail(&page->lru, &lruvec->lists[lru]);
(*pgmoved)++;
}
@@ -276,41 +310,30 @@ void rotate_reclaimable_page(struct page *page)
}
}
-static void update_page_reclaim_stat(struct zone *zone, struct page *page,
+static void update_page_reclaim_stat(struct lruvec *lruvec,
int file, int rotated)
{
- struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
- struct zone_reclaim_stat *memcg_reclaim_stat;
-
- memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
reclaim_stat->recent_scanned[file]++;
if (rotated)
reclaim_stat->recent_rotated[file]++;
-
- if (!memcg_reclaim_stat)
- return;
-
- memcg_reclaim_stat->recent_scanned[file]++;
- if (rotated)
- memcg_reclaim_stat->recent_rotated[file]++;
}
-static void __activate_page(struct page *page, void *arg)
+static void __activate_page(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
- struct zone *zone = page_zone(page);
-
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
int file = page_is_file_cache(page);
int lru = page_lru_base_type(page);
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
SetPageActive(page);
lru += LRU_ACTIVE;
- add_page_to_lru_list(zone, page, lru);
- __count_vm_event(PGACTIVATE);
+ add_page_to_lru_list(page, lruvec, lru);
- update_page_reclaim_stat(zone, page, file, 1);
+ __count_vm_event(PGACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 1);
}
}
@@ -347,7 +370,7 @@ void activate_page(struct page *page)
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
- __activate_page(page, NULL);
+ __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
spin_unlock_irq(&zone->lru_lock);
}
#endif
@@ -414,11 +437,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru)
void add_page_to_unevictable_list(struct page *page)
{
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
SetPageUnevictable(page);
SetPageLRU(page);
- add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
+ add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
spin_unlock_irq(&zone->lru_lock);
}
@@ -443,11 +468,11 @@ void add_page_to_unevictable_list(struct page *page)
* be write it out by flusher threads as this is much more effective
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_fn(struct page *page, void *arg)
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
int lru, file;
bool active;
- struct zone *zone = page_zone(page);
if (!PageLRU(page))
return;
@@ -460,13 +485,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
return;
active = PageActive(page);
-
file = page_is_file_cache(page);
lru = page_lru_base_type(page);
- del_page_from_lru_list(zone, page, lru + active);
+
+ del_page_from_lru_list(page, lruvec, lru + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(zone, page, lru);
+ add_page_to_lru_list(page, lruvec, lru);
if (PageWriteback(page) || PageDirty(page)) {
/*
@@ -476,19 +501,17 @@ static void lru_deactivate_fn(struct page *page, void *arg)
*/
SetPageReclaim(page);
} else {
- struct lruvec *lruvec;
/*
* The page's writeback ends up during pagevec
* We moves tha page into tail of inactive.
*/
- lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
list_move_tail(&page->lru, &lruvec->lists[lru]);
__count_vm_event(PGROTATED);
}
if (active)
__count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(zone, page, file, 0);
+ update_page_reclaim_stat(lruvec, file, 0);
}
/*
@@ -588,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold)
int i;
LIST_HEAD(pages_to_free);
struct zone *zone = NULL;
+ struct lruvec *lruvec;
unsigned long uninitialized_var(flags);
for (i = 0; i < nr; i++) {
@@ -615,9 +639,11 @@ void release_pages(struct page **pages, int nr, int cold)
zone = pagezone;
spin_lock_irqsave(&zone->lru_lock, flags);
}
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
- del_page_from_lru_list(zone, page, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
list_add(&page->lru, &pages_to_free);
@@ -649,8 +675,8 @@ EXPORT_SYMBOL(__pagevec_release);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
-void lru_add_page_tail(struct zone* zone,
- struct page *page, struct page *page_tail)
+void lru_add_page_tail(struct page *page, struct page *page_tail,
+ struct lruvec *lruvec)
{
int uninitialized_var(active);
enum lru_list lru;
@@ -659,7 +685,8 @@ void lru_add_page_tail(struct zone* zone,
VM_BUG_ON(!PageHead(page));
VM_BUG_ON(PageCompound(page_tail));
VM_BUG_ON(PageLRU(page_tail));
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock));
+ VM_BUG_ON(NR_CPUS != 1 &&
+ !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
SetPageLRU(page_tail);
@@ -688,20 +715,20 @@ void lru_add_page_tail(struct zone* zone,
* Use the standard add function to put page_tail on the list,
* but then correct its position so they all end up in order.
*/
- add_page_to_lru_list(zone, page_tail, lru);
+ add_page_to_lru_list(page_tail, lruvec, lru);
list_head = page_tail->lru.prev;
list_move_tail(&page_tail->lru, list_head);
}
if (!PageUnevictable(page))
- update_page_reclaim_stat(zone, page_tail, file, active);
+ update_page_reclaim_stat(lruvec, file, active);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __pagevec_lru_add_fn(struct page *page, void *arg)
+static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
enum lru_list lru = (enum lru_list)arg;
- struct zone *zone = page_zone(page);
int file = is_file_lru(lru);
int active = is_active_lru(lru);
@@ -712,8 +739,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
SetPageLRU(page);
if (active)
SetPageActive(page);
- add_page_to_lru_list(zone, page, lru);
- update_page_reclaim_stat(zone, page, file, active);
+ add_page_to_lru_list(page, lruvec, lru);
+ update_page_reclaim_stat(lruvec, file, active);
}
/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9d3dd3763cf..4c5ff7f284d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -26,7 +26,7 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.migratepage = migrate_page,
};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1d..457b10baef5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-static inline int page_swapcount(struct page *page)
+int page_swapcount(struct page *page)
{
int count = 0;
struct swap_info_struct *p;
@@ -717,37 +717,6 @@ int free_swap_and_cache(swp_entry_t entry)
return p != NULL;
}
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_count_swap_user - count the user of a swap entry
- * @ent: the swap entry to be checked
- * @pagep: the pointer for the swap cache page of the entry to be stored
- *
- * Returns the number of the user of the swap entry. The number is valid only
- * for swaps of anonymous pages.
- * If the entry is found on swap cache, the page is stored to pagep with
- * refcount of it being incremented.
- */
-int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
-{
- struct page *page;
- struct swap_info_struct *p;
- int count = 0;
-
- page = find_get_page(&swapper_space, ent.val);
- if (page)
- count += page_mapcount(page);
- p = swap_info_get(ent);
- if (p) {
- count += swap_count(p->swap_map[swp_offset(ent)]);
- spin_unlock(&swap_lock);
- }
-
- *pagep = page;
- return count;
-}
-#endif
-
#ifdef CONFIG_HIBERNATION
/*
* Find the swap type that corresponds to given device (if any).
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd5..00000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * mm/thrash.c
- *
- * Copyright (C) 2004, Red Hat, Inc.
- * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
- * Released under the GPL, see the file COPYING for details.
- *
- * Simple token based thrashing protection, using the algorithm
- * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
- *
- * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
- * Improved algorithm to pass token:
- * Each task has a priority which is incremented if it contended
- * for the token in an interval less than its previous attempt.
- * If the token is acquired, that task's priority is boosted to prevent
- * the token from bouncing around too often and to let the task make
- * some progress in its execution.
- */
-
-#include <linux/jiffies.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/swap.h>
-#include <linux/memcontrol.h>
-
-#include <trace/events/vmscan.h>
-
-#define TOKEN_AGING_INTERVAL (0xFF)
-
-static DEFINE_SPINLOCK(swap_token_lock);
-struct mm_struct *swap_token_mm;
-static struct mem_cgroup *swap_token_memcg;
-
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
- struct mem_cgroup *memcg;
-
- memcg = try_get_mem_cgroup_from_mm(mm);
- if (memcg)
- css_put(mem_cgroup_css(memcg));
-
- return memcg;
-}
-#else
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
- return NULL;
-}
-#endif
-
-void grab_swap_token(struct mm_struct *mm)
-{
- int current_interval;
- unsigned int old_prio = mm->token_priority;
- static unsigned int global_faults;
- static unsigned int last_aging;
-
- global_faults++;
-
- current_interval = global_faults - mm->faultstamp;
-
- if (!spin_trylock(&swap_token_lock))
- return;
-
- /* First come first served */
- if (!swap_token_mm)
- goto replace_token;
-
- /*
- * Usually, we don't need priority aging because long interval faults
- * makes priority decrease quickly. But there is one exception. If the
- * token owner task is sleeping, it never make long interval faults.
- * Thus, we need a priority aging mechanism instead. The requirements
- * of priority aging are
- * 1) An aging interval is reasonable enough long. Too short aging
- * interval makes quick swap token lost and decrease performance.
- * 2) The swap token owner task have to get priority aging even if
- * it's under sleep.
- */
- if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
- swap_token_mm->token_priority /= 2;
- last_aging = global_faults;
- }
-
- if (mm == swap_token_mm) {
- mm->token_priority += 2;
- goto update_priority;
- }
-
- if (current_interval < mm->last_interval)
- mm->token_priority++;
- else {
- if (likely(mm->token_priority > 0))
- mm->token_priority--;
- }
-
- /* Check if we deserve the token */
- if (mm->token_priority > swap_token_mm->token_priority)
- goto replace_token;
-
-update_priority:
- trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
-
-out:
- mm->faultstamp = global_faults;
- mm->last_interval = current_interval;
- spin_unlock(&swap_token_lock);
- return;
-
-replace_token:
- mm->token_priority += 2;
- trace_replace_swap_token(swap_token_mm, mm);
- swap_token_mm = mm;
- swap_token_memcg = swap_token_memcg_from_mm(mm);
- last_aging = global_faults;
- goto out;
-}
-
-/* Called on process exit. */
-void __put_swap_token(struct mm_struct *mm)
-{
- spin_lock(&swap_token_lock);
- if (likely(mm == swap_token_mm)) {
- trace_put_swap_token(swap_token_mm);
- swap_token_mm = NULL;
- swap_token_memcg = NULL;
- }
- spin_unlock(&swap_token_lock);
-}
-
-static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
-{
- if (!a)
- return true;
- if (!b)
- return true;
- if (a == b)
- return true;
- return false;
-}
-
-void disable_swap_token(struct mem_cgroup *memcg)
-{
- /* memcg reclaim don't disable unrelated mm token. */
- if (match_memcg(memcg, swap_token_memcg)) {
- spin_lock(&swap_token_lock);
- if (match_memcg(memcg, swap_token_memcg)) {
- trace_disable_swap_token(swap_token_mm);
- swap_token_mm = NULL;
- swap_token_memcg = NULL;
- }
- spin_unlock(&swap_token_lock);
- }
-}
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df..75801acdaac 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
}
EXPORT_SYMBOL(vmtruncate);
-int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
-{
- struct address_space *mapping = inode->i_mapping;
- loff_t holebegin = round_up(lstart, PAGE_SIZE);
- loff_t holelen = 1 + lend - holebegin;
-
- /*
- * If the underlying filesystem is not going to provide
- * a way to truncate a range of blocks (punch a hole) -
- * we should return failure right now.
- */
- if (!inode->i_op->truncate_range)
- return -ENOSYS;
-
- mutex_lock(&inode->i_mutex);
- inode_dio_wait(inode);
- unmap_mapping_range(mapping, holebegin, holelen, 1);
- inode->i_op->truncate_range(inode, lstart, lend);
- /* unmap again to remove racily COWed private pages */
- unmap_mapping_range(mapping, holebegin, holelen, 1);
- mutex_unlock(&inode->i_mutex);
-
- return 0;
-}
-
/**
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
* @inode: inode
diff --git a/mm/util.c b/mm/util.c
index ae962b31de8..8c7265afa29 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
+#include <linux/security.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
+unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long pgoff)
+{
+ unsigned long ret;
+ struct mm_struct *mm = current->mm;
+
+ ret = security_mmap_file(file, prot, flag);
+ if (!ret) {
+ down_write(&mm->mmap_sem);
+ ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
+ up_write(&mm->mmap_sem);
+ }
+ return ret;
+}
+
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long offset)
+{
+ if (unlikely(offset + PAGE_ALIGN(len) < offset))
+ return -EINVAL;
+ if (unlikely(offset & ~PAGE_MASK))
+ return -EINVAL;
+
+ return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(vm_mmap);
+
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff883b44..2aad49981b5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void)
/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
- va->flags = tmp->flags | VM_VM_AREA;
+ va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
+ va->vm = tmp;
__insert_vmap_area(va);
}
@@ -2375,8 +2376,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
- vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
- vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
+ vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
+ vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
if (!vas || !vms)
goto err_free2;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33c332bbab7..eeb3bc9d1d3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
-/*
- * reclaim_mode determines how the inactive list is shrunk
- * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
- * RECLAIM_MODE_ASYNC: Do not block
- * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
- * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
- * page from the LRU and reclaim all pages within a
- * naturally aligned range
- * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
- * order-0 pages and then compact the zone
- */
-typedef unsigned __bitwise__ reclaim_mode_t;
-#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
-#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
-#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
-#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
-#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
-
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -96,11 +78,8 @@ struct scan_control {
int order;
- /*
- * Intend to reclaim enough continuous memory rather than reclaim
- * enough amount of memory. i.e, mode for high order allocation.
- */
- reclaim_mode_t reclaim_mode;
+ /* Scan (total_size >> priority) pages at once */
+ int priority;
/*
* The memory cgroup that hit its limit and as a result is the
@@ -115,11 +94,6 @@ struct scan_control {
nodemask_t *nodemask;
};
-struct mem_cgroup_zone {
- struct mem_cgroup *mem_cgroup;
- struct zone *zone;
-};
-
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
#ifdef ARCH_HAS_PREFETCH
@@ -164,44 +138,21 @@ static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
-
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
- return !mz->mem_cgroup;
-}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
-
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
- return true;
-}
#endif
-static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
+static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
- if (!scanning_global_lru(mz))
- return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
+ if (!mem_cgroup_disabled())
+ return mem_cgroup_get_lru_size(lruvec, lru);
- return &mz->zone->reclaim_stat;
+ return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}
-static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
- enum lru_list lru)
-{
- if (!scanning_global_lru(mz))
- return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
- zone_to_nid(mz->zone),
- zone_idx(mz->zone),
- BIT(lru));
-
- return zone_page_state(mz->zone, NR_LRU_BASE + lru);
-}
-
-
/*
* Add a shrinker callback to be called from the vm
*/
@@ -364,39 +315,6 @@ out:
return ret;
}
-static void set_reclaim_mode(int priority, struct scan_control *sc,
- bool sync)
-{
- reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
-
- /*
- * Initially assume we are entering either lumpy reclaim or
- * reclaim/compaction.Depending on the order, we will either set the
- * sync mode or just reclaim order-0 pages later.
- */
- if (COMPACTION_BUILD)
- sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
- else
- sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
-
- /*
- * Avoid using lumpy reclaim or reclaim/compaction if possible by
- * restricting when its set to either costly allocations or when
- * under memory pressure
- */
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- sc->reclaim_mode |= syncmode;
- else if (sc->order && priority < DEF_PRIORITY - 2)
- sc->reclaim_mode |= syncmode;
- else
- sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
-}
-
-static void reset_reclaim_mode(struct scan_control *sc)
-{
- sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
-}
-
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
return 1;
if (bdi == current->backing_dev_info)
return 1;
-
- /* lumpy reclaim for hugepage often need a lot of write */
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- return 1;
return 0;
}
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
- trace_mm_vmscan_writepage(page,
- trace_reclaim_flags(page, sc->reclaim_mode));
+ trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -701,19 +614,15 @@ enum page_references {
};
static enum page_references page_check_references(struct page *page,
- struct mem_cgroup_zone *mz,
struct scan_control *sc)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
- referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
+ referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
+ &vm_flags);
referenced_page = TestClearPageReferenced(page);
- /* Lumpy reclaim - ignore references */
- if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
- return PAGEREF_RECLAIM;
-
/*
* Mlock lost the isolation race with us. Let try_to_unmap()
* move the page to the unevictable list.
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
if (referenced_ptes) {
- if (PageAnon(page))
+ if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
/*
* All mapped pages start out with page table
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct mem_cgroup_zone *mz,
+ struct zone *zone,
struct scan_control *sc,
- int priority,
unsigned long *ret_nr_dirty,
unsigned long *ret_nr_writeback)
{
@@ -794,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON(PageActive(page));
- VM_BUG_ON(page_zone(page) != mz->zone);
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
@@ -813,22 +721,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageWriteback(page)) {
nr_writeback++;
- /*
- * Synchronous reclaim cannot queue pages for
- * writeback due to the possibility of stack overflow
- * but if it encounters a page under writeback, wait
- * for the IO to complete.
- */
- if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
- may_enter_fs)
- wait_on_page_writeback(page);
- else {
- unlock_page(page);
- goto keep_lumpy;
- }
+ unlock_page(page);
+ goto keep;
}
- references = page_check_references(page, mz, sc);
+ references = page_check_references(page, sc);
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
@@ -879,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* unless under significant pressure.
*/
if (page_is_file_cache(page) &&
- (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+ (!current_is_kswapd() ||
+ sc->priority >= DEF_PRIORITY - 2)) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
@@ -908,7 +806,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto activate_locked;
case PAGE_SUCCESS:
if (PageWriteback(page))
- goto keep_lumpy;
+ goto keep;
if (PageDirty(page))
goto keep;
@@ -994,7 +892,6 @@ cull_mlocked:
try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
- reset_reclaim_mode(sc);
continue;
activate_locked:
@@ -1007,8 +904,6 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
- reset_reclaim_mode(sc);
-keep_lumpy:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
@@ -1020,7 +915,7 @@ keep_lumpy:
* will encounter the same problem
*/
if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
- zone_set_flag(mz->zone, ZONE_CONGESTED);
+ zone_set_flag(zone, ZONE_CONGESTED);
free_hot_cold_page_list(&free_pages, 1);
@@ -1041,34 +936,15 @@ keep_lumpy:
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
- bool all_lru_mode;
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
- all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
- (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
-
- /*
- * When checking the active state, we need to be sure we are
- * dealing with comparible boolean values. Take the logical not
- * of each.
- */
- if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
- return ret;
-
- if (!all_lru_mode && !!page_is_file_cache(page) != file)
- return ret;
-
- /*
- * When this function is being called for lumpy reclaim, we
- * initially look into all LRU pages, active, inactive and
- * unevictable; only give shrink_page_list evictable pages.
- */
+ /* Do not give back unevictable pages for compaction */
if (PageUnevictable(page))
return ret;
@@ -1135,54 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
- * @mz: The mem_cgroup_zone to pull pages from.
+ * @lruvec: The LRU vector to pull pages from.
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @mode: One of the LRU isolation modes
- * @active: True [1] if isolating active pages
- * @file: True [1] if isolating file [!anon] pages
+ * @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
- struct mem_cgroup_zone *mz, struct list_head *dst,
+ struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
- isolate_mode_t mode, int active, int file)
+ isolate_mode_t mode, enum lru_list lru)
{
- struct lruvec *lruvec;
- struct list_head *src;
+ struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
- unsigned long nr_lumpy_taken = 0;
- unsigned long nr_lumpy_dirty = 0;
- unsigned long nr_lumpy_failed = 0;
unsigned long scan;
- int lru = LRU_BASE;
-
- lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
- if (active)
- lru += LRU_ACTIVE;
- if (file)
- lru += LRU_FILE;
- src = &lruvec->lists[lru];
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;
- unsigned long pfn;
- unsigned long end_pfn;
- unsigned long page_pfn;
- int zone_id;
+ int nr_pages;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
VM_BUG_ON(!PageLRU(page));
- switch (__isolate_lru_page(page, mode, file)) {
+ switch (__isolate_lru_page(page, mode)) {
case 0:
- mem_cgroup_lru_del(page);
+ nr_pages = hpage_nr_pages(page);
+ mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
list_move(&page->lru, dst);
- nr_taken += hpage_nr_pages(page);
+ nr_taken += nr_pages;
break;
case -EBUSY:
@@ -1193,93 +1054,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
default:
BUG();
}
-
- if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
- continue;
-
- /*
- * Attempt to take all pages in the order aligned region
- * surrounding the tag page. Only take those pages of
- * the same active state as that tag page. We may safely
- * round the target page pfn down to the requested order
- * as the mem_map is guaranteed valid out to MAX_ORDER,
- * where that page is in a different zone we will detect
- * it from its zone id and abort this block scan.
- */
- zone_id = page_zone_id(page);
- page_pfn = page_to_pfn(page);
- pfn = page_pfn & ~((1 << sc->order) - 1);
- end_pfn = pfn + (1 << sc->order);
- for (; pfn < end_pfn; pfn++) {
- struct page *cursor_page;
-
- /* The target page is in the block, ignore it. */
- if (unlikely(pfn == page_pfn))
- continue;
-
- /* Avoid holes within the zone. */
- if (unlikely(!pfn_valid_within(pfn)))
- break;
-
- cursor_page = pfn_to_page(pfn);
-
- /* Check that we have not crossed a zone boundary. */
- if (unlikely(page_zone_id(cursor_page) != zone_id))
- break;
-
- /*
- * If we don't have enough swap space, reclaiming of
- * anon page which don't already have a swap slot is
- * pointless.
- */
- if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
- !PageSwapCache(cursor_page))
- break;
-
- if (__isolate_lru_page(cursor_page, mode, file) == 0) {
- unsigned int isolated_pages;
-
- mem_cgroup_lru_del(cursor_page);
- list_move(&cursor_page->lru, dst);
- isolated_pages = hpage_nr_pages(cursor_page);
- nr_taken += isolated_pages;
- nr_lumpy_taken += isolated_pages;
- if (PageDirty(cursor_page))
- nr_lumpy_dirty += isolated_pages;
- scan++;
- pfn += isolated_pages - 1;
- } else {
- /*
- * Check if the page is freed already.
- *
- * We can't use page_count() as that
- * requires compound_head and we don't
- * have a pin on the page here. If a
- * page is tail, we may or may not
- * have isolated the head, so assume
- * it's not free, it'd be tricky to
- * track the head status without a
- * page pin.
- */
- if (!PageTail(cursor_page) &&
- !atomic_read(&cursor_page->_count))
- continue;
- break;
- }
- }
-
- /* If we break out of the loop above, lumpy reclaim failed */
- if (pfn < end_pfn)
- nr_lumpy_failed++;
}
*nr_scanned = scan;
-
- trace_mm_vmscan_lru_isolate(sc->order,
- nr_to_scan, scan,
- nr_taken,
- nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
- mode, file);
+ trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+ nr_taken, mode, is_file_lru(lru));
return nr_taken;
}
@@ -1316,15 +1095,16 @@ int isolate_lru_page(struct page *page)
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
if (PageLRU(page)) {
int lru = page_lru(page);
- ret = 0;
get_page(page);
ClearPageLRU(page);
-
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
+ ret = 0;
}
spin_unlock_irq(&zone->lru_lock);
}
@@ -1357,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file,
}
static noinline_for_stack void
-putback_inactive_pages(struct mem_cgroup_zone *mz,
- struct list_head *page_list)
+putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
- struct zone *zone = mz->zone;
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ struct zone *zone = lruvec_zone(lruvec);
LIST_HEAD(pages_to_free);
/*
@@ -1379,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
spin_lock_irq(&zone->lru_lock);
continue;
}
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
+ add_page_to_lru_list(page, lruvec, lru);
+
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
@@ -1390,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
@@ -1407,112 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
list_splice(&pages_to_free, page_list);
}
-static noinline_for_stack void
-update_isolated_counts(struct mem_cgroup_zone *mz,
- struct list_head *page_list,
- unsigned long *nr_anon,
- unsigned long *nr_file)
-{
- struct zone *zone = mz->zone;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- unsigned long nr_active = 0;
- struct page *page;
- int lru;
-
- /*
- * Count pages and clear active flags
- */
- list_for_each_entry(page, page_list, lru) {
- int numpages = hpage_nr_pages(page);
- lru = page_lru_base_type(page);
- if (PageActive(page)) {
- lru += LRU_ACTIVE;
- ClearPageActive(page);
- nr_active += numpages;
- }
- count[lru] += numpages;
- }
-
- preempt_disable();
- __count_vm_events(PGDEACTIVATE, nr_active);
-
- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
-
- *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
- preempt_enable();
-}
-
-/*
- * Returns true if a direct reclaim should wait on pages under writeback.
- *
- * If we are direct reclaiming for contiguous pages and we do not reclaim
- * everything in the list, try again and wait for writeback IO to complete.
- * This will stall high-order allocations noticeably. Only do that when really
- * need to free the pages under high memory pressure.
- */
-static inline bool should_reclaim_stall(unsigned long nr_taken,
- unsigned long nr_freed,
- int priority,
- struct scan_control *sc)
-{
- int lumpy_stall_priority;
-
- /* kswapd should not stall on sync IO */
- if (current_is_kswapd())
- return false;
-
- /* Only stall on lumpy reclaim */
- if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
- return false;
-
- /* If we have reclaimed everything on the isolated list, no stall */
- if (nr_freed == nr_taken)
- return false;
-
- /*
- * For high-order allocations, there are two stall thresholds.
- * High-cost allocations stall immediately where as lower
- * order allocations such as stacks require the scanning
- * priority to be much higher before stalling.
- */
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- lumpy_stall_priority = DEF_PRIORITY;
- else
- lumpy_stall_priority = DEF_PRIORITY / 3;
-
- return priority <= lumpy_stall_priority;
-}
-
/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
- struct scan_control *sc, int priority, int file)
+shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- unsigned long nr_anon;
- unsigned long nr_file;
unsigned long nr_dirty = 0;
unsigned long nr_writeback = 0;
- isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
- struct zone *zone = mz->zone;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ isolate_mode_t isolate_mode = 0;
+ int file = is_file_lru(lru);
+ struct zone *zone = lruvec_zone(lruvec);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,10 +1217,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
return SWAP_CLUSTER_MAX;
}
- set_reclaim_mode(priority, sc, false);
- if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
- isolate_mode |= ISOLATE_ACTIVE;
-
lru_add_drain();
if (!sc->may_unmap)
@@ -1535,47 +1226,43 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
- sc, isolate_mode, 0, file);
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+ &nr_scanned, sc, isolate_mode, lru);
+
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scanned);
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scanned);
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
}
spin_unlock_irq(&zone->lru_lock);
if (nr_taken == 0)
return 0;
- update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
-
- nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc,
&nr_dirty, &nr_writeback);
- /* Check if we should syncronously wait for writeback */
- if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
- set_reclaim_mode(priority, sc, true);
- nr_reclaimed += shrink_page_list(&page_list, mz, sc,
- priority, &nr_dirty, &nr_writeback);
- }
-
spin_lock_irq(&zone->lru_lock);
- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ reclaim_stat->recent_scanned[file] += nr_taken;
- if (current_is_kswapd())
- __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
- __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
+ if (global_reclaim(sc)) {
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
+ nr_reclaimed);
+ else
+ __count_zone_vm_events(PGSTEAL_DIRECT, zone,
+ nr_reclaimed);
+ }
- putback_inactive_pages(mz, &page_list);
+ putback_inactive_pages(lruvec, &page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
@@ -1604,14 +1291,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
* DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
* isolated page is PageWriteback
*/
- if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+ if (nr_writeback && nr_writeback >=
+ (nr_taken >> (DEF_PRIORITY - sc->priority)))
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
- priority,
- trace_shrink_flags(file, sc->reclaim_mode));
+ sc->priority,
+ trace_shrink_flags(file));
return nr_reclaimed;
}
@@ -1633,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
* But we had to alter page->flags anyway.
*/
-static void move_active_pages_to_lru(struct zone *zone,
+static void move_active_pages_to_lru(struct lruvec *lruvec,
struct list_head *list,
struct list_head *pages_to_free,
enum lru_list lru)
{
+ struct zone *zone = lruvec_zone(lruvec);
unsigned long pgmoved = 0;
struct page *page;
+ int nr_pages;
while (!list_empty(list)) {
- struct lruvec *lruvec;
-
page = lru_to_page(list);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+ nr_pages = hpage_nr_pages(page);
+ mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
- pgmoved += hpage_nr_pages(page);
+ pgmoved += nr_pages;
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
- del_page_from_lru_list(zone, page, lru);
+ del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
@@ -1672,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone,
}
static void shrink_active_list(unsigned long nr_to_scan,
- struct mem_cgroup_zone *mz,
+ struct lruvec *lruvec,
struct scan_control *sc,
- int priority, int file)
+ enum lru_list lru)
{
unsigned long nr_taken;
unsigned long nr_scanned;
@@ -1683,15 +1373,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned long nr_rotated = 0;
- isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
- struct zone *zone = mz->zone;
+ isolate_mode_t isolate_mode = 0;
+ int file = is_file_lru(lru);
+ struct zone *zone = lruvec_zone(lruvec);
lru_add_drain();
- reset_reclaim_mode(sc);
-
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
@@ -1699,18 +1388,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
spin_lock_irq(&zone->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
- isolate_mode, 1, file);
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ &nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
zone->pages_scanned += nr_scanned;
reclaim_stat->recent_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
- if (file)
- __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
- else
- __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
@@ -1732,7 +1418,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
}
- if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
+ if (page_referenced(page, 0, sc->target_mem_cgroup,
+ &vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
@@ -1765,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- move_active_pages_to_lru(zone, &l_active, &l_hold,
- LRU_ACTIVE + file * LRU_FILE);
- move_active_pages_to_lru(zone, &l_inactive, &l_hold,
- LRU_BASE + file * LRU_FILE);
+ move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
+ move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
@@ -1791,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
/**
* inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @zone: zone to check
- * @sc: scan control of this context
+ * @lruvec: LRU vector to check
*
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
-static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
+static int inactive_anon_is_low(struct lruvec *lruvec)
{
/*
* If we don't have swap space, anonymous page deactivation
@@ -1806,14 +1490,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
if (!total_swap_pages)
return 0;
- if (!scanning_global_lru(mz))
- return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
- mz->zone);
+ if (!mem_cgroup_disabled())
+ return mem_cgroup_inactive_anon_is_low(lruvec);
- return inactive_anon_is_low_global(mz->zone);
+ return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
-static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
+static inline int inactive_anon_is_low(struct lruvec *lruvec)
{
return 0;
}
@@ -1831,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone)
/**
* inactive_file_is_low - check if file pages need to be deactivated
- * @mz: memory cgroup and zone to check
+ * @lruvec: LRU vector to check
*
* When the system is doing streaming IO, memory pressure here
* ensures that active file pages get deactivated, until more
@@ -1843,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone)
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
-static int inactive_file_is_low(struct mem_cgroup_zone *mz)
+static int inactive_file_is_low(struct lruvec *lruvec)
{
- if (!scanning_global_lru(mz))
- return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
- mz->zone);
+ if (!mem_cgroup_disabled())
+ return mem_cgroup_inactive_file_is_low(lruvec);
- return inactive_file_is_low_global(mz->zone);
+ return inactive_file_is_low_global(lruvec_zone(lruvec));
}
-static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
+static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
- if (file)
- return inactive_file_is_low(mz);
+ if (is_file_lru(lru))
+ return inactive_file_is_low(lruvec);
else
- return inactive_anon_is_low(mz);
+ return inactive_anon_is_low(lruvec);
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct mem_cgroup_zone *mz,
- struct scan_control *sc, int priority)
+ struct lruvec *lruvec, struct scan_control *sc)
{
- int file = is_file_lru(lru);
-
if (is_active_lru(lru)) {
- if (inactive_list_is_low(mz, file))
- shrink_active_list(nr_to_scan, mz, sc, priority, file);
+ if (inactive_list_is_low(lruvec, lru))
+ shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
- return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
+ return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
-static int vmscan_swappiness(struct mem_cgroup_zone *mz,
- struct scan_control *sc)
+static int vmscan_swappiness(struct scan_control *sc)
{
if (global_reclaim(sc))
return vm_swappiness;
- return mem_cgroup_swappiness(mz->mem_cgroup);
+ return mem_cgroup_swappiness(sc->target_mem_cgroup);
}
/*
@@ -1891,17 +1569,18 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
*
* nr[0] = anon pages to scan; nr[1] = file pages to scan
*/
-static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
- unsigned long *nr, int priority)
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long *nr)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2], denominator;
enum lru_list lru;
int noswap = 0;
bool force_scan = false;
+ struct zone *zone = lruvec_zone(lruvec);
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1913,7 +1592,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
* latencies, so it's better to scan a minimum amount there as
* well.
*/
- if (current_is_kswapd() && mz->zone->all_unreclaimable)
+ if (current_is_kswapd() && zone->all_unreclaimable)
force_scan = true;
if (!global_reclaim(sc))
force_scan = true;
@@ -1927,16 +1606,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
goto out;
}
- anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ get_lru_size(lruvec, LRU_INACTIVE_FILE);
if (global_reclaim(sc)) {
- free = zone_page_state(mz->zone, NR_FREE_PAGES);
+ free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
force-scan anon pages. */
- if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
+ if (unlikely(file + free <= high_wmark_pages(zone))) {
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
@@ -1948,8 +1627,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = vmscan_swappiness(mz, sc);
- file_prio = 200 - vmscan_swappiness(mz, sc);
+ anon_prio = vmscan_swappiness(sc);
+ file_prio = 200 - anon_prio;
/*
* OK, so we have swap space and a fair amount of page cache
@@ -1962,7 +1641,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
- spin_lock_irq(&mz->zone->lru_lock);
+ spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
@@ -1978,12 +1657,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
*/
- ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
+ ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
ap /= reclaim_stat->recent_rotated[0] + 1;
- fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
+ fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&mz->zone->lru_lock);
+ spin_unlock_irq(&zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
@@ -1993,9 +1672,9 @@ out:
int file = is_file_lru(lru);
unsigned long scan;
- scan = zone_nr_lru_pages(mz, lru);
- if (priority || noswap) {
- scan >>= priority;
+ scan = get_lru_size(lruvec, lru);
+ if (sc->priority || noswap || !vmscan_swappiness(sc)) {
+ scan >>= sc->priority;
if (!scan && force_scan)
scan = SWAP_CLUSTER_MAX;
scan = div64_u64(scan * fraction[file], denominator);
@@ -2004,14 +1683,25 @@ out:
}
}
+/* Use reclaim/compaction for costly allocs or under memory pressure */
+static bool in_reclaim_compaction(struct scan_control *sc)
+{
+ if (COMPACTION_BUILD && sc->order &&
+ (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
+ sc->priority < DEF_PRIORITY - 2))
+ return true;
+
+ return false;
+}
+
/*
- * Reclaim/compaction depends on a number of pages being freed. To avoid
- * disruption to the system, a small number of order-0 pages continue to be
- * rotated and reclaimed in the normal fashion. However, by the time we get
- * back to the allocator and call try_to_compact_zone(), we ensure that
- * there are enough free pages for it to be likely successful
+ * Reclaim/compaction is used for high-order allocation requests. It reclaims
+ * order-0 pages before compacting the zone. should_continue_reclaim() returns
+ * true if more pages should be reclaimed such that when the page allocator
+ * calls try_to_compact_zone() that it will have enough free pages to succeed.
+ * It will give up earlier than that if there is difficulty reclaiming pages.
*/
-static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
+static inline bool should_continue_reclaim(struct lruvec *lruvec,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
@@ -2020,7 +1710,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
unsigned long inactive_lru_pages;
/* If not in reclaim/compaction mode, stop */
- if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+ if (!in_reclaim_compaction(sc))
return false;
/* Consider stopping depending on scan and reclaim activity */
@@ -2051,15 +1741,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+ inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
if (nr_swap_pages > 0)
- inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
+ inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
- switch (compaction_suitable(mz->zone, sc->order)) {
+ switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
@@ -2071,8 +1761,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
- struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2084,7 +1773,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
restart:
nr_reclaimed = 0;
nr_scanned = sc->nr_scanned;
- get_scan_count(mz, sc, nr, priority);
+ get_scan_count(lruvec, sc, nr);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2096,7 +1785,7 @@ restart:
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- mz, sc, priority);
+ lruvec, sc);
}
}
/*
@@ -2107,12 +1796,8 @@ restart:
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
- if (nr_reclaimed >= nr_to_reclaim)
- nr_to_reclaim = 0;
- else
- nr_to_reclaim -= nr_reclaimed;
-
- if (!nr_to_reclaim && priority < DEF_PRIORITY)
+ if (nr_reclaimed >= nr_to_reclaim &&
+ sc->priority < DEF_PRIORITY)
break;
}
blk_finish_plug(&plug);
@@ -2122,35 +1807,33 @@ restart:
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(mz))
- shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
/* reclaim/compaction might need reclaim to continue */
- if (should_continue_reclaim(mz, nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc))
+ if (should_continue_reclaim(lruvec, nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc))
goto restart;
throttle_vm_writeout(sc->gfp_mask);
}
-static void shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
- .priority = priority,
+ .priority = sc->priority,
};
struct mem_cgroup *memcg;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
- struct mem_cgroup_zone mz = {
- .mem_cgroup = memcg,
- .zone = zone,
- };
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+
+ shrink_lruvec(lruvec, sc);
- shrink_mem_cgroup_zone(priority, &mz, sc);
/*
* Limit reclaim has historically picked one memcg and
* scanned it with decreasing priority levels until
@@ -2226,8 +1909,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
* the caller that it should consider retrying the allocation instead of
* further reclaim.
*/
-static bool shrink_zones(int priority, struct zonelist *zonelist,
- struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
@@ -2254,7 +1936,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
if (COMPACTION_BUILD) {
/*
@@ -2286,7 +1969,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
/* need some check for avoid more shrink_zone() */
}
- shrink_zone(priority, zone, sc);
+ shrink_zone(zone, sc);
}
return aborted_reclaim;
@@ -2337,7 +2020,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc,
struct shrink_control *shrink)
{
- int priority;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
@@ -2350,11 +2032,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (global_reclaim(sc))
count_vm_event(ALLOCSTALL);
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ do {
sc->nr_scanned = 0;
- if (!priority)
- disable_swap_token(sc->target_mem_cgroup);
- aborted_reclaim = shrink_zones(priority, zonelist, sc);
+ aborted_reclaim = shrink_zones(zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
@@ -2396,7 +2076,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
- priority < DEF_PRIORITY - 2) {
+ sc->priority < DEF_PRIORITY - 2) {
struct zone *preferred_zone;
first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2404,7 +2084,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
&preferred_zone);
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
- }
+ } while (--sc->priority >= 0);
out:
delayacct_freepages_end();
@@ -2442,6 +2122,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_unmap = 1,
.may_swap = 1,
.order = order,
+ .priority = DEF_PRIORITY,
.target_mem_cgroup = NULL,
.nodemask = nodemask,
};
@@ -2474,17 +2155,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !noswap,
.order = 0,
+ .priority = 0,
.target_mem_cgroup = memcg,
};
- struct mem_cgroup_zone mz = {
- .mem_cgroup = memcg,
- .zone = zone,
- };
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
sc.may_writepage,
sc.gfp_mask);
@@ -2495,7 +2174,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_mem_cgroup_zone(0, &mz, &sc);
+ shrink_lruvec(lruvec, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2516,6 +2195,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_swap = !noswap,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.order = 0,
+ .priority = DEF_PRIORITY,
.target_mem_cgroup = memcg,
.nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2546,8 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct zone *zone, struct scan_control *sc,
- int priority)
+static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
struct mem_cgroup *memcg;
@@ -2556,14 +2235,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct mem_cgroup_zone mz = {
- .mem_cgroup = memcg,
- .zone = zone,
- };
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_anon_is_low(&mz))
- shrink_active_list(SWAP_CLUSTER_MAX, &mz,
- sc, priority, 0);
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
@@ -2672,7 +2348,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
{
int all_zones_ok;
unsigned long balanced;
- int priority;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
@@ -2696,18 +2371,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
};
loop_again:
total_scanned = 0;
+ sc.priority = DEF_PRIORITY;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ do {
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
- /* The swap token gets in the way of swapout... */
- if (!priority)
- disable_swap_token(NULL);
-
all_zones_ok = 1;
balanced = 0;
@@ -2721,14 +2393,15 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
- age_active_anon(zone, &sc, priority);
+ age_active_anon(zone, &sc);
/*
* If the number of buffer_heads in the machine
@@ -2776,7 +2449,8 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
sc.nr_scanned = 0;
@@ -2820,7 +2494,7 @@ loop_again:
!zone_watermark_ok_safe(zone, testorder,
high_wmark_pages(zone) + balance_gap,
end_zone, 0)) {
- shrink_zone(priority, zone, &sc);
+ shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2877,7 +2551,7 @@ loop_again:
* OK, kswapd is getting into trouble. Take a nap, then take
* another pass across the zones.
*/
- if (total_scanned && (priority < DEF_PRIORITY - 2)) {
+ if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
if (has_under_min_watermark_zone)
count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
else
@@ -2892,7 +2566,7 @@ loop_again:
*/
if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
- }
+ } while (--sc.priority >= 0);
out:
/*
@@ -2942,7 +2616,8 @@ out:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
/* Would compaction fail due to lack of free memory? */
@@ -3209,6 +2884,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
.order = 0,
+ .priority = DEF_PRIORITY,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
@@ -3386,7 +3062,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- int priority;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3395,6 +3070,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.order = order,
+ .priority = ZONE_RECLAIM_PRIORITY,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
@@ -3417,11 +3093,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
- priority = ZONE_RECLAIM_PRIORITY;
do {
- shrink_zone(priority, zone, &sc);
- priority--;
- } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+ shrink_zone(zone, &sc);
+ } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3536,7 +3210,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
if (mapping_unevictable(page_mapping(page)))
return 0;
- if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
+ if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
return 0;
return 1;
@@ -3572,6 +3246,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
+ lruvec = mem_cgroup_page_lruvec(page, zone);
if (!PageLRU(page) || !PageUnevictable(page))
continue;
@@ -3581,11 +3256,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
VM_BUG_ON(PageActive(page));
ClearPageUnevictable(page);
- __dec_zone_state(zone, NR_UNEVICTABLE);
- lruvec = mem_cgroup_lru_move_lists(zone, page,
- LRU_UNEVICTABLE, lru);
- list_move(&page->lru, &lruvec->lists[lru]);
- __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+ del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+ add_page_to_lru_list(page, lruvec, lru);
pgrescued++;
}
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f600557a765..1bbbbd9776a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -613,6 +613,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
"Reclaimable",
"Movable",
"Reserve",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
"Isolate",
};
@@ -738,7 +741,8 @@ const char * const vmstat_text[] = {
"pgmajfault",
TEXTS_FOR_ZONES("pgrefill")
- TEXTS_FOR_ZONES("pgsteal")
+ TEXTS_FOR_ZONES("pgsteal_kswapd")
+ TEXTS_FOR_ZONES("pgsteal_direct")
TEXTS_FOR_ZONES("pgscan_kswapd")
TEXTS_FOR_ZONES("pgscan_direct")
@@ -747,7 +751,6 @@ const char * const vmstat_text[] = {
#endif
"pginodesteal",
"slabs_scanned",
- "kswapd_steal",
"kswapd_inodesteal",
"kswapd_low_wmark_hit_quickly",
"kswapd_high_wmark_hit_quickly",
@@ -1220,7 +1223,6 @@ module_init(setup_vmstat)
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
#include <linux/debugfs.h>
-static struct dentry *extfrag_debug_root;
/*
* Return an index indicating how much of the available free memory is
@@ -1358,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = {
static int __init extfrag_debug_init(void)
{
+ struct dentry *extfrag_debug_root;
+
extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
if (!extfrag_debug_root)
return -ENOMEM;
if (!debugfs_create_file("unusable_index", 0444,
extfrag_debug_root, NULL, &unusable_file_ops))
- return -ENOMEM;
+ goto fail;
if (!debugfs_create_file("extfrag_index", 0444,
extfrag_debug_root, NULL, &extfrag_file_ops))
- return -ENOMEM;
+ goto fail;
return 0;
+fail:
+ debugfs_remove_recursive(extfrag_debug_root);
+ return -ENOMEM;
}
module_init(extfrag_debug_init);