summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig17
-rw-r--r--mm/Makefile1
-rw-r--r--mm/bootmem.c6
-rw-r--r--mm/bounce.c8
-rw-r--r--mm/compaction.c147
-rw-r--r--mm/frontswap.c344
-rw-r--r--mm/internal.h9
-rw-r--r--mm/madvise.c18
-rw-r--r--mm/memblock.c115
-rw-r--r--mm/memcontrol.c6
-rw-r--r--mm/memory-failure.c14
-rw-r--r--mm/memory.c21
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c5
-rw-r--r--mm/nobootmem.c40
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c21
-rw-r--r--mm/page_alloc.c15
-rw-r--r--mm/page_cgroup.c4
-rw-r--r--mm/page_io.c12
-rw-r--r--mm/pagewalk.c1
-rw-r--r--mm/percpu-vm.c1
-rw-r--r--mm/shmem.c253
-rw-r--r--mm/sparse.c20
-rw-r--r--mm/swapfile.c66
-rw-r--r--mm/vmalloc.c36
-rw-r--r--mm/vmscan.c17
28 files changed, 776 insertions, 427 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98..82fed4eb2b6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
in a negligible performance hit.
If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap to cache swap pages if tmem is present"
+ depends on SWAP
+ default n
+ help
+ Frontswap is so named because it can be thought of as the opposite
+ of a "backing" store for a swap device. The data is stored into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. When space in transcendent memory is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit
+ and swap data is stored as normal on the matching swap device.
+
+ If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88..2e2fbbefb99 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c..bcb63ac48cc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
@@ -710,6 +710,10 @@ again:
if (ptr)
return ptr;
+ /* do not panic in alloc_bootmem_bdata() */
+ if (limit && goal + size > limit)
+ limit = 0;
+
ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
if (ptr)
return ptr;
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca188..04208677556 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
static mempool_t *page_pool, *isa_page_pool;
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
static __init int init_emergency_pool(void)
{
-#ifndef CONFIG_MEMORY_HOTPLUG
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
if (max_pfn <= max_low_pfn)
return 0;
#endif
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
- printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+ printk("bounce pool size: %d pages\n", POOL_SIZE);
return 0;
}
__initcall(init_emergency_pool);
+#endif
+#ifdef CONFIG_HIGHMEM
/*
* highmem version, map in to vec
*/
diff --git a/mm/compaction.c b/mm/compaction.c
index 4ac338af512..2f42d952853 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -236,7 +236,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
- if (cc->mode != COMPACT_SYNC)
+ if (!cc->sync)
return 0;
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -304,8 +304,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
* satisfies the allocation
*/
pageblock_nr = low_pfn >> pageblock_order;
- if (cc->mode != COMPACT_SYNC &&
- last_pageblock_nr != pageblock_nr &&
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
!migrate_async_suitable(get_pageblock_migratetype(page))) {
low_pfn += pageblock_nr_pages;
low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
@@ -326,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
- if (cc->mode != COMPACT_SYNC)
+ if (!cc->sync)
mode |= ISOLATE_ASYNC_MIGRATE;
lruvec = mem_cgroup_page_lruvec(page, zone);
@@ -361,90 +360,27 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
-/*
- * Returns true if MIGRATE_UNMOVABLE pageblock was successfully
- * converted to MIGRATE_MOVABLE type, false otherwise.
- */
-static bool rescue_unmovable_pageblock(struct page *page)
-{
- unsigned long pfn, start_pfn, end_pfn;
- struct page *start_page, *end_page;
-
- pfn = page_to_pfn(page);
- start_pfn = pfn & ~(pageblock_nr_pages - 1);
- end_pfn = start_pfn + pageblock_nr_pages;
-
- start_page = pfn_to_page(start_pfn);
- end_page = pfn_to_page(end_pfn);
-
- /* Do not deal with pageblocks that overlap zones */
- if (page_zone(start_page) != page_zone(end_page))
- return false;
-
- for (page = start_page, pfn = start_pfn; page < end_page; pfn++,
- page++) {
- if (!pfn_valid_within(pfn))
- continue;
-
- if (PageBuddy(page)) {
- int order = page_order(page);
-
- pfn += (1 << order) - 1;
- page += (1 << order) - 1;
-
- continue;
- } else if (page_count(page) == 0 || PageLRU(page))
- continue;
-
- return false;
- }
-
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE);
- return true;
-}
-
-enum smt_result {
- GOOD_AS_MIGRATION_TARGET,
- FAIL_UNMOVABLE_TARGET,
- FAIL_BAD_TARGET,
-};
-/*
- * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block
- * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page
- * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise.
- */
-static enum smt_result suitable_migration_target(struct page *page,
- struct compact_control *cc)
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
{
int migratetype = get_pageblock_migratetype(page);
/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
- return FAIL_BAD_TARGET;
+ return false;
/* If the page is a large free page, then allow migration */
if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return GOOD_AS_MIGRATION_TARGET;
+ return true;
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (cc->mode != COMPACT_ASYNC_UNMOVABLE &&
- migrate_async_suitable(migratetype))
- return GOOD_AS_MIGRATION_TARGET;
-
- if (cc->mode == COMPACT_ASYNC_MOVABLE &&
- migratetype == MIGRATE_UNMOVABLE)
- return FAIL_UNMOVABLE_TARGET;
-
- if (cc->mode != COMPACT_ASYNC_MOVABLE &&
- migratetype == MIGRATE_UNMOVABLE &&
- rescue_unmovable_pageblock(page))
- return GOOD_AS_MIGRATION_TARGET;
+ if (migrate_async_suitable(migratetype))
+ return true;
/* Otherwise skip the block */
- return FAIL_BAD_TARGET;
+ return false;
}
/*
@@ -478,13 +414,6 @@ static void isolate_freepages(struct zone *zone,
zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
/*
- * isolate_freepages() may be called more than once during
- * compact_zone_order() run and we want only the most recent
- * count.
- */
- cc->nr_pageblocks_skipped = 0;
-
- /*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
@@ -492,7 +421,6 @@ static void isolate_freepages(struct zone *zone,
for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
- enum smt_result ret;
if (!pfn_valid(pfn))
continue;
@@ -509,12 +437,9 @@ static void isolate_freepages(struct zone *zone,
continue;
/* Check the block is suitable for migration */
- ret = suitable_migration_target(page, cc);
- if (ret != GOOD_AS_MIGRATION_TARGET) {
- if (ret == FAIL_UNMOVABLE_TARGET)
- cc->nr_pageblocks_skipped++;
+ if (!suitable_migration_target(page))
continue;
- }
+
/*
* Found a block suitable for isolating free pages from. Now
* we disabled interrupts, double check things are ok and
@@ -523,14 +448,12 @@ static void isolate_freepages(struct zone *zone,
*/
isolated = 0;
spin_lock_irqsave(&zone->lock, flags);
- ret = suitable_migration_target(page, cc);
- if (ret == GOOD_AS_MIGRATION_TARGET) {
+ if (suitable_migration_target(page)) {
end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
isolated = isolate_freepages_block(pfn, end_pfn,
freelist, false);
nr_freepages += isolated;
- } else if (ret == FAIL_UNMOVABLE_TARGET)
- cc->nr_pageblocks_skipped++;
+ }
spin_unlock_irqrestore(&zone->lock, flags);
/*
@@ -762,9 +685,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)&cc->freepages, false,
- (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT
- : MIGRATE_ASYNC);
+ (unsigned long)cc, false,
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -779,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ if (err == -ENOMEM) {
+ ret = COMPACT_PARTIAL;
+ goto out;
+ }
}
-
}
out:
@@ -793,8 +718,7 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- enum compact_mode mode,
- unsigned long *nr_pageblocks_skipped)
+ bool sync)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -802,17 +726,12 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
- .mode = mode,
+ .sync = sync,
};
- unsigned long rc;
-
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- rc = compact_zone(zone, &cc);
- *nr_pageblocks_skipped = cc.nr_pageblocks_skipped;
-
- return rc;
+ return compact_zone(zone, &cc);
}
int sysctl_extfrag_threshold = 500;
@@ -837,8 +756,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
- unsigned long nr_pageblocks_skipped;
- enum compact_mode mode;
/*
* Check whether it is worth even starting compaction. The order check is
@@ -855,22 +772,12 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE;
-retry:
- status = compact_zone_order(zone, order, gfp_mask, mode,
- &nr_pageblocks_skipped);
+ status = compact_zone_order(zone, order, gfp_mask, sync);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
-
- if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) {
- if (nr_pageblocks_skipped) {
- mode = COMPACT_ASYNC_UNMOVABLE;
- goto retry;
- }
- }
}
return rc;
@@ -904,7 +811,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
if (ok && cc->order > zone->compact_order_failed)
zone->compact_order_failed = cc->order + 1;
/* Currently async compaction is never deferred. */
- else if (!ok && cc->mode == COMPACT_SYNC)
+ else if (!ok && cc->sync)
defer_compaction(zone, cc->order);
}
@@ -919,7 +826,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
- .mode = COMPACT_ASYNC_MOVABLE,
+ .sync = false,
};
return __compact_pgdat(pgdat, &cc);
@@ -929,7 +836,7 @@ static int compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
- .mode = COMPACT_SYNC,
+ .sync = true,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 00000000000..6b3e71a2cd4
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,344 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/security.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success. As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache. In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increases control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+ frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+ frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+ frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+ frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = true;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+ frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.init(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+ frontswap_clear(sis, offset);
+ atomic_dec(&sis->frontswap_pages);
+}
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ dup = 1;
+ ret = frontswap_ops.store(type, offset, page);
+ if (ret == 0) {
+ frontswap_set(sis, offset);
+ inc_frontswap_succ_stores();
+ if (!dup)
+ atomic_inc(&sis->frontswap_pages);
+ } else {
+ /*
+ failed dup always results in automatic invalidate of
+ the (older) page from frontswap
+ */
+ inc_frontswap_failed_stores();
+ if (dup)
+ __frontswap_clear(sis, offset);
+ }
+ if (frontswap_writethrough_enabled)
+ /* report failure so swap also writes to swap device */
+ ret = -1;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ ret = frontswap_ops.load(type, offset, page);
+ if (ret == 0)
+ inc_frontswap_loads();
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset)) {
+ frontswap_ops.invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
+ }
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+static unsigned long __frontswap_curr_pages(void)
+{
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ totalpages += atomic_read(&si->frontswap_pages);
+ }
+ return totalpages;
+}
+
+static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ int *swapid)
+{
+ int ret = -EINVAL;
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+ int type;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+ } else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ /* ensure there is enough RAM to fetch pages from frontswap */
+ if (security_vm_enough_memory_mm(current->mm, pages)) {
+ ret = -ENOMEM;
+ continue;
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+ *swapid = type;
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+static int __frontswap_shrink(unsigned long target_pages,
+ unsigned long *pages_to_unuse,
+ int *type)
+{
+ unsigned long total_pages = 0, total_pages_to_unuse;
+
+ assert_spin_locked(&swap_lock);
+
+ total_pages = __frontswap_curr_pages();
+ if (total_pages <= target_pages) {
+ /* Nothing to do */
+ *pages_to_unuse = 0;
+ return 0;
+ }
+ total_pages_to_unuse = total_pages - target_pages;
+ return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
+}
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ unsigned long pages_to_unuse = 0;
+ int type, ret;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+ spin_unlock(&swap_lock);
+ if (ret == 0 && pages_to_unuse)
+ try_to_unuse(type, true, pages_to_unuse);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+
+ spin_lock(&swap_lock);
+ totalpages = __frontswap_curr_pages();
+ spin_unlock(&swap_lock);
+
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("frontswap", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", S_IRUGO, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &frontswap_invalidates);
+#endif
+ return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/internal.h b/mm/internal.h
index 5cbb7819004..2ba87fbfb75 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,9 +94,6 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
-extern void set_pageblock_migratetype(struct page *page, int migratetype);
-extern int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype);
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
@@ -104,7 +101,6 @@ extern bool is_free_buddy_page(struct page *page);
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-#include <linux/compaction.h>
/*
* in mm/compaction.c
@@ -123,14 +119,11 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
- enum compact_mode mode; /* Compaction mode */
+ bool sync; /* Synchronous migration */
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
-
- /* Number of UNMOVABLE destination pageblocks skipped during scan */
- unsigned long nr_pageblocks_skipped;
};
unsigned long
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08..14d260fa0d1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
+#include <linux/file.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma,
{
loff_t offset;
int error;
+ struct file *f;
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
return -EINVAL;
- if (!vma->vm_file || !vma->vm_file->f_mapping
- || !vma->vm_file->f_mapping->host) {
+ f = vma->vm_file;
+
+ if (!f || !f->f_mapping || !f->f_mapping->host) {
return -EINVAL;
}
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma,
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- /* filesystem's fallocate may need to take i_mutex */
+ /*
+ * Filesystem's fallocate may need to take i_mutex. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_sem.
+ */
+ get_file(f);
up_read(&current->mm->mmap_sem);
- error = do_fallocate(vma->vm_file,
+ error = do_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
+ fput(f);
down_read(&current->mm->mmap_sem);
return error;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba43..5cc6731b00c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
MAX_NUMNODES);
}
-/*
- * Free memblock.reserved.regions
- */
-int __init_memblock memblock_free_reserved_regions(void)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- return memblock_free(__pa(memblock.reserved.regions),
- sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
-/*
- * Reserve memblock.reserved.regions
- */
-int __init_memblock memblock_reserve_reserved_regions(void)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- return memblock_reserve(__pa(memblock.reserved.regions),
- sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
type->total_size -= type->regions[r].size;
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
}
-static int __init_memblock memblock_double_array(struct memblock_type *type)
+phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ *addr = __pa(memblock.reserved.regions);
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+}
+
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+ phys_addr_t new_area_start,
+ phys_addr_t new_area_size)
{
struct memblock_region *new_array, *old_array;
+ phys_addr_t old_alloc_size, new_alloc_size;
phys_addr_t old_size, new_size, addr;
int use_slab = slab_is_available();
int *in_slab;
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
/* Calculate new doubled size */
old_size = type->max * sizeof(struct memblock_region);
new_size = old_size << 1;
+ /*
+ * We need to allocated new one align to PAGE_SIZE,
+ * so we can free them completely later.
+ */
+ old_alloc_size = PAGE_ALIGN(old_size);
+ new_alloc_size = PAGE_ALIGN(new_size);
/* Retrieve the slab flag */
if (type == &memblock.memory)
@@ -222,7 +234,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
new_array = kmalloc(new_size, GFP_KERNEL);
addr = new_array ? __pa(new_array) : 0;
} else {
- addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+ /* only exclude range when trying to double reserved.regions */
+ if (type != &memblock.reserved)
+ new_area_start = new_area_size = 0;
+
+ addr = memblock_find_in_range(new_area_start + new_area_size,
+ memblock.current_limit,
+ new_alloc_size, PAGE_SIZE);
+ if (!addr && new_area_size)
+ addr = memblock_find_in_range(0,
+ min(new_area_start, memblock.current_limit),
+ new_alloc_size, PAGE_SIZE);
+
new_array = addr ? __va(addr) : 0;
}
if (!addr) {
@@ -251,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
- memblock_free(__pa(old_array), old_size);
+ memblock_free(__pa(old_array), old_alloc_size);
/* Reserve the new array if that comes from the memblock.
* Otherwise, we needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_size));
+ BUG_ON(memblock_reserve(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -399,7 +422,7 @@ repeat:
*/
if (!insert) {
while (type->cnt + nr_new > type->max)
- if (memblock_double_array(type) < 0)
+ if (memblock_double_array(type, obase, size) < 0)
return -ENOMEM;
insert = true;
goto repeat;
@@ -450,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
/* we'll create at most two more regions */
while (type->cnt + 2 > type->max)
- if (memblock_double_array(type) < 0)
+ if (memblock_double_array(type, base, size) < 0)
return -ENOMEM;
for (i = 0; i < type->cnt; i++) {
@@ -540,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
*
* Find the first free area from *@idx which matches @nid, fill the out
* parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -616,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
* __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
*
* Reverse of __next_free_mem_range().
*/
@@ -867,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
return memblock_search(&memblock.memory, addr) != -1;
}
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) is a subset of a memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
{
int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
memblock.memory.regions[idx].size) >= end;
}
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
memblock_cap_size(base, &size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7..f72b5e52451 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
{
if (root_memcg == memcg)
return true;
- if (!root_memcg->use_hierarchy)
+ if (!root_memcg->use_hierarchy || !memcg)
return false;
return css_is_ancestor(&memcg->css, &root_memcg->css);
}
@@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
- * @mem: the memory cgroup
+ * @memcg: the memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
* pages.
@@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
/**
* test_mem_cgroup_node_reclaimable
- * @mem: the target memcg
+ * @memcg: the target memcg
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants flle only information.
*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ab1e7145e29..de4ce705845 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs(struct list_head *to_kill, int doit, int trapno,
+static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
int fail, struct page *page, unsigned long pfn,
int flags)
{
struct to_kill *tk, *next;
list_for_each_entry_safe (tk, next, to_kill, nd) {
- if (doit) {
+ if (forcekill) {
/*
* In case something went wrong with munmapping
* make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int kill = 1;
+ int kill = 1, forcekill;
struct page *hpage = compound_head(p);
struct page *ppage;
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* be called inside page lock (it's recommended but not enforced).
*/
mapping = page_mapping(hpage);
- if (!PageDirty(hpage) && mapping &&
+ if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
mapping_cap_writeback_dirty(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
* killing is needed or not. Only kill when the page
- * was dirty, otherwise the tokill list is merely
+ * was dirty or the process is not restartable,
+ * otherwise the tokill list is merely
* freed. When there was a problem unmapping earlier
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs(&tokill, !!PageDirty(ppage), trapno,
+ forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ kill_procs(&tokill, forcekill, trapno,
ret != SWAP_SUCCESS, p, pfn, flags);
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9..91f69459d3e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->mm = mm;
tlb->fullmm = fullmm;
+ tlb->start = -1UL;
+ tlb->end = 0;
tlb->need_flush = 0;
tlb->fast_mode = (num_possible_cpus() == 1);
tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
{
struct mmu_gather_batch *batch, *next;
+ tlb->start = start;
+ tlb->end = end;
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
*/
if (force_flush) {
force_flush = 0;
+
+#ifdef HAVE_GENERIC_MMU_GATHER
+ tlb->start = addr;
+ tlb->end = end;
+#endif
tlb_flush_mmu(tlb);
if (addr != end)
goto again;
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+#ifdef CONFIG_DEBUG_VM
+ if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+ pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+ __func__, addr, end,
+ vma->vm_start,
+ vma->vm_end);
+ BUG();
+ }
+#endif
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
@@ -1366,7 +1383,7 @@ void unmap_vmas(struct mmu_gather *tlb,
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f..427bb291dd0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
pgdat = hotadd_new_pgdat(nid, start);
ret = -ENOMEM;
if (!pgdat)
- goto out;
+ goto error;
new_pgdat = 1;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca1..1d771e4200d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, true);
+ false, MIGRATE_SYNC);
if (nr_failed)
putback_lru_pages(&pagelist);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index ab81d482ae6..be26d5cbe56 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* is actually a signal that all of the page has become dirty.
* Whereas only part of our page may be dirty.
*/
- __set_page_dirty_nobuffers(newpage);
+ if (PageSwapBacked(page))
+ SetPageDirty(newpage);
+ else
+ __set_page_dirty_nobuffers(newpage);
}
mlock_migrate_page(newpage, page);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001b..405573010f9 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
__free_pages_bootmem(pfn_to_page(i), 0);
}
+static unsigned long __init __free_memory_core(phys_addr_t start,
+ phys_addr_t end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = min_t(unsigned long,
+ PFN_DOWN(end), max_low_pfn);
+
+ if (start_pfn > end_pfn)
+ return 0;
+
+ __free_pages_memory(start_pfn, end_pfn);
+
+ return end_pfn - start_pfn;
+}
+
unsigned long __init free_low_memory_core_early(int nodeid)
{
unsigned long count = 0;
- phys_addr_t start, end;
+ phys_addr_t start, end, size;
u64 i;
- /* free reserved array temporarily so that it's treated as free area */
- memblock_free_reserved_regions();
-
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
- unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = min_t(unsigned long,
- PFN_DOWN(end), max_low_pfn);
- if (start_pfn < end_pfn) {
- __free_pages_memory(start_pfn, end_pfn);
- count += end_pfn - start_pfn;
- }
- }
+ for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+ count += __free_memory_core(start, end);
+
+ /* free range that is used for reserved array if we allocate it */
+ size = get_allocated_memblock_reserved_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
- /* put region array back? */
- memblock_reserve_reserved_regions();
return count;
}
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc0997..d4b0c10872d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e1967736..ac300c99baf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
{
- unsigned long points;
+ long points;
+ long adj;
if (oom_unkillable_task(p, memcg, nodemask))
return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ adj = p->signal->oom_score_adj;
+ if (adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
}
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- points -= 30 * totalpages / 1000;
+ adj -= 30;
- /*
- * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
- * either completely disable oom killing or always prefer a certain
- * task.
- */
- points += p->signal->oom_score_adj * totalpages / 1000;
+ /* Normalize to oom_score_adj units */
+ adj *= totalpages / 1000;
+ points += adj;
/*
* Never return 0 for an eligible task regardless of the root bonus and
* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
*/
- return points ? points : 1;
+ return points > 0 ? points : 1;
}
/*
@@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
- * @mem: current's memory controller, if constrained
+ * @memcg: current's memory controller, if constrained
* @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* Dumps the current memory state of all eligible tasks. Tasks not in the same
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6092f331b32..4a4f9219683 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
-void set_pageblock_migratetype(struct page *page, int migratetype)
+static void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled))
@@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone,
return pages_moved;
}
-int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
+static int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -5635,7 +5635,12 @@ static struct page *
__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
int **resultp)
{
- return alloc_page(GFP_HIGHUSER_MOVABLE);
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ return alloc_page(gfp_mask);
}
/* [start, end) must belong to a single zone. */
@@ -5651,7 +5656,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
- .mode = COMPACT_SYNC,
+ .sync = true,
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059..eb750f85139 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
/**
* swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
* @old: old id
* @new: new id
*
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
/**
* swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
*
* Returns old value at success, 0 at failure.
* (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611..34f02923744 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
+#include <linux/frontswap.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
+ if (frontswap_store(page) == 0) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
+ if (frontswap_load(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e1271..6c118d012bb 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
/**
* walk_page_range - walk a memory map's page tables with a callback
- * @mm: memory map to walk
* @addr: starting address
* @end: ending address
* @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c..3707c71ae4c 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index 585bd220a21..c15b998e5a8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
}
/*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+ pgoff_t index, swp_entry_t swap)
+{
+ void *item;
+
+ rcu_read_lock();
+ item = radix_tree_lookup(&mapping->page_tree, index);
+ rcu_read_unlock();
+ return item == swp_to_radix_entry(swap);
+}
+
+/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
pgoff_t index, gfp_t gfp, void *expected)
{
- int error = 0;
+ int error;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageSwapBacked(page));
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = index;
+
+ spin_lock_irq(&mapping->tree_lock);
if (!expected)
- error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ error = radix_tree_insert(&mapping->page_tree, index, page);
+ else
+ error = shmem_radix_tree_replace(mapping, index, expected,
+ page);
if (!error) {
- page_cache_get(page);
- page->mapping = mapping;
- page->index = index;
-
- spin_lock_irq(&mapping->tree_lock);
- if (!expected)
- error = radix_tree_insert(&mapping->page_tree,
- index, page);
- else
- error = shmem_radix_tree_replace(mapping, index,
- expected, page);
- if (!error) {
- mapping->nrpages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- __inc_zone_page_state(page, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
- } else {
- page->mapping = NULL;
- spin_unlock_irq(&mapping->tree_lock);
- page_cache_release(page);
- }
- if (!expected)
- radix_tree_preload_end();
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
}
- if (error)
- mem_cgroup_uncharge_cache_page(page);
return error;
}
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
mutex_lock(&shmem_swaplist_mutex);
/*
* We needed to drop mutex to make that restrictive page
- * allocation; but the inode might already be freed by now,
- * and we cannot refer to inode or mapping or info to check.
- * However, we do hold page lock on the PageSwapCache page,
- * so can check if that still has our reference remaining.
+ * allocation, but the inode might have been freed while we
+ * dropped it: although a racing shmem_evict_inode() cannot
+ * complete without emptying the radix_tree, our page lock
+ * on this swapcache page is not enough to prevent that -
+ * free_swap_and_cache() of our swap entry will only
+ * trylock_page(), removing swap from radix_tree whatever.
+ *
+ * We must not proceed to shmem_add_to_page_cache() if the
+ * inode has been freed, but of course we cannot rely on
+ * inode or mapping or info to check that. However, we can
+ * safely check if our swap entry is still in use (and here
+ * it can't have got reused for another page): if it's still
+ * in use, then the inode cannot have been freed yet, and we
+ * can safely proceed (if it's no longer in use, that tells
+ * nothing about the inode, but we don't need to unuse swap).
*/
if (!page_swapcount(*pagep))
error = -ENOENT;
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
/*
* There's a faint possibility that swap page was replaced before
- * caller locked it: it will come back later with the right page.
+ * caller locked it: caller will come back later with the right page.
*/
- if (unlikely(!PageSwapCache(page)))
+ if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
goto out;
/*
@@ -995,21 +1015,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
newpage = shmem_alloc_page(gfp, info, index);
if (!newpage)
return -ENOMEM;
- VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
- *pagep = newpage;
page_cache_get(newpage);
copy_highpage(newpage, oldpage);
+ flush_dcache_page(newpage);
- VM_BUG_ON(!PageLocked(oldpage));
__set_page_locked(newpage);
- VM_BUG_ON(!PageUptodate(oldpage));
SetPageUptodate(newpage);
- VM_BUG_ON(!PageSwapBacked(oldpage));
SetPageSwapBacked(newpage);
- VM_BUG_ON(!swap_index);
set_page_private(newpage, swap_index);
- VM_BUG_ON(!PageSwapCache(oldpage));
SetPageSwapCache(newpage);
/*
@@ -1019,13 +1033,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
spin_lock_irq(&swap_mapping->tree_lock);
error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
newpage);
- __inc_zone_page_state(newpage, NR_FILE_PAGES);
- __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ if (!error) {
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+ __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ }
spin_unlock_irq(&swap_mapping->tree_lock);
- BUG_ON(error);
- mem_cgroup_replace_page_cache(oldpage, newpage);
- lru_cache_add_anon(newpage);
+ if (unlikely(error)) {
+ /*
+ * Is this possible? I think not, now that our callers check
+ * both PageSwapCache and page_private after getting page lock;
+ * but be defensive. Reverse old to newpage for clear and free.
+ */
+ oldpage = newpage;
+ } else {
+ mem_cgroup_replace_page_cache(oldpage, newpage);
+ lru_cache_add_anon(newpage);
+ *pagep = newpage;
+ }
ClearPageSwapCache(oldpage);
set_page_private(oldpage, 0);
@@ -1033,7 +1058,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
unlock_page(oldpage);
page_cache_release(oldpage);
page_cache_release(oldpage);
- return 0;
+ return error;
}
/*
@@ -1107,9 +1132,10 @@ repeat:
/* We have to do this with page locked to prevent races */
lock_page(page);
- if (!PageSwapCache(page) || page->mapping) {
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST; /* try again */
- goto failed;
+ goto unlock;
}
if (!PageUptodate(page)) {
error = -EIO;
@@ -1125,9 +1151,12 @@ repeat:
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
- if (!error)
+ if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
gfp, swp_to_radix_entry(swap));
+ /* We already confirmed swap, and make no allocation */
+ VM_BUG_ON(error);
+ }
if (error)
goto failed;
@@ -1164,11 +1193,18 @@ repeat:
__set_page_locked(page);
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
- if (!error)
- error = shmem_add_to_page_cache(page, mapping, index,
- gfp, NULL);
if (error)
goto decused;
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, NULL);
+ radix_tree_preload_end();
+ }
+ if (error) {
+ mem_cgroup_uncharge_cache_page(page);
+ goto decused;
+ }
lru_cache_add_anon(page);
spin_lock(&info->lock);
@@ -1228,14 +1264,10 @@ decused:
unacct:
shmem_unacct_blocks(info->flags, 1);
failed:
- if (swap.val && error != -EINVAL) {
- struct page *test = find_get_page(mapping, index);
- if (test && !radix_tree_exceptional_entry(test))
- page_cache_release(test);
- /* Have another try if the entry has changed */
- if (test != swp_to_radix_entry(swap))
- error = -EEXIST;
- }
+ if (swap.val && error != -EINVAL &&
+ !shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
@@ -1247,7 +1279,7 @@ failed:
spin_unlock(&info->lock);
goto repeat;
}
- if (error == -EEXIST)
+ if (error == -EEXIST) /* from above or from radix_tree_insert */
goto repeat;
return error;
}
@@ -1577,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &page_cache_pipe_buf_ops,
.spd_release = spd_release_page,
@@ -1665,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
if (error > 0) {
*ppos += error;
@@ -1674,98 +1707,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
- pgoff_t index, pgoff_t end, int origin)
-{
- struct page *page;
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- bool done = false;
- int i;
-
- pagevec_init(&pvec, 0);
- pvec.nr = 1; /* start small: we may be there already */
- while (!done) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- pvec.nr, pvec.pages, indices);
- if (!pvec.nr) {
- if (origin == SEEK_DATA)
- index = end;
- break;
- }
- for (i = 0; i < pvec.nr; i++, index++) {
- if (index < indices[i]) {
- if (origin == SEEK_HOLE) {
- done = true;
- break;
- }
- index = indices[i];
- }
- page = pvec.pages[i];
- if (page && !radix_tree_exceptional_entry(page)) {
- if (!PageUptodate(page))
- page = NULL;
- }
- if (index >= end ||
- (page && origin == SEEK_DATA) ||
- (!page && origin == SEEK_HOLE)) {
- done = true;
- break;
- }
- }
- shmem_deswap_pagevec(&pvec);
- pagevec_release(&pvec);
- pvec.nr = PAGEVEC_SIZE;
- cond_resched();
- }
- return index;
-}
-
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
- struct address_space *mapping;
- struct inode *inode;
- pgoff_t start, end;
- loff_t new_offset;
-
- if (origin != SEEK_DATA && origin != SEEK_HOLE)
- return generic_file_llseek_size(file, offset, origin,
- MAX_LFS_FILESIZE);
- mapping = file->f_mapping;
- inode = mapping->host;
- mutex_lock(&inode->i_mutex);
- /* We're holding i_mutex so we can access i_size directly */
-
- if (offset < 0)
- offset = -EINVAL;
- else if (offset >= inode->i_size)
- offset = -ENXIO;
- else {
- start = offset >> PAGE_CACHE_SHIFT;
- end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- new_offset = shmem_seek_hole_data(mapping, start, end, origin);
- new_offset <<= PAGE_CACHE_SHIFT;
- if (new_offset > offset) {
- if (new_offset < inode->i_size)
- offset = new_offset;
- else if (origin == SEEK_DATA)
- offset = -ENXIO;
- else
- offset = inode->i_size;
- }
- }
-
- if (offset >= 0 && offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- mutex_unlock(&inode->i_mutex);
- return offset;
-}
-
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -1936,7 +1877,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
}
@@ -2769,7 +2710,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
- .llseek = shmem_file_llseek,
+ .llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf9160e8..c7bb952400c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -275,8 +275,9 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- pg_data_t *host_pgdat;
- unsigned long goal;
+ unsigned long goal, limit;
+ unsigned long *p;
+ int nid;
/*
* A page may contain usemaps for other sections preventing the
* page being freed and making a section unremovable while
@@ -287,10 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & PAGE_SECTION_MASK;
- host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
- return __alloc_bootmem_node_nopanic(host_pgdat, size,
- SMP_CACHE_BYTES, goal);
+ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ limit = goal + (1UL << PA_SECTION_SHIFT);
+ nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
+again:
+ p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
+ SMP_CACHE_BYTES, goal, limit);
+ if (!p && limit) {
+ limit = 0;
+ goto again;
+ }
+ return p;
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef5..71373d03fce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
if ((p->flags & SWP_BLKDEV) &&
disk->fops->swap_slot_free_notify)
disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
}
/*
- * Scan swap_map from current position to next entry still in use.
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, bool frontswap)
{
unsigned int max = si->max;
unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
+ if (frontswap) {
+ if (frontswap_test(si, i))
+ break;
+ else
+ continue;
+ }
count = si->swap_map[i];
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+ unsigned long pages_to_unuse)
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
* interactive performance.
*/
cond_resched();
+ if (frontswap && pages_to_unuse > 0) {
+ if (!--pages_to_unuse)
+ break;
+ }
}
mmput(start_mm);
@@ -1486,7 +1504,8 @@ bad_bmap:
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map)
+ unsigned char *swap_map,
+ unsigned long *frontswap_map)
{
int i, prev;
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
else
p->prio = --least_priority;
p->swap_map = swap_map;
+ frontswap_map_set(p, frontswap_map);
p->flags |= SWP_WRITEOK;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
+ frontswap_init(p->type);
spin_unlock(&swap_lock);
}
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
- err = try_to_unuse(type);
+ err = try_to_unuse(type, false, 0); /* force all pages to be unused */
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
* sys_swapoff for this swap_info_struct at this point.
*/
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map);
+ enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
goto out_dput;
}
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ frontswap_invalidate_area(type);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ vfree(frontswap_map_get(p));
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);
@@ -1893,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
/*
* Find out how many pages are allowed for a single swap
- * device. There are three limiting factors: 1) the number
+ * device. There are two limiting factors: 1) the number
* of bits for the swap offset in the swp_entry_t type, and
* 2) the number of bits in the swap pte as defined by the
- * the different architectures, and 3) the number of free bits
- * in an exceptional radix_tree entry. In order to find the
+ * different architectures. In order to find the
* largest possible bit mask, a swap entry with swap type 0
* and swap offset ~0UL is created, encoded to a swap pte,
* decoded to a swp_entry_t again, and finally the swap
* offset is extracted. This will mask all the bits from
* the initial ~0UL mask that can't be encoded in either
* the swp_entry_t or the architecture definition of a
- * swap pte. Then the same is done for a radix_tree entry.
+ * swap pte.
*/
maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL))));
- maxpages = swp_offset(radix_to_swp_entry(
- swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
-
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
if (maxpages > swap_header->info.last_page) {
maxpages = swap_header->info.last_page + 1;
/* p->max is an unsigned int: don't overflow it */
@@ -1988,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
+ unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -2071,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = nr_extents;
goto bad_swap;
}
+ /* frontswap enabled? set up bit-per-page map for frontswap */
+ if (frontswap_enabled)
+ frontswap_map = vzalloc(maxpages / sizeof(long));
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map);
+ enable_swap_info(p, prio, swap_map, frontswap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s\n",
+ "Priority:%d extents:%d across:%lluk %s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
- (p->flags & SWP_DISCARDABLE) ? "D" : "");
+ (p->flags & SWP_DISCARDABLE) ? "D" : "",
+ (frontswap_map) ? "FS" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2aad49981b5..e03f4c7307a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1280,7 +1280,7 @@ DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, void *caller)
+ unsigned long flags, const void *caller)
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
@@ -1306,7 +1306,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
}
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, void *caller)
+ unsigned long flags, const void *caller)
{
setup_vmalloc_vm(vm, va, flags, caller);
insert_vmalloc_vmlist(vm);
@@ -1314,7 +1314,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, void *caller)
+ unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
@@ -1375,7 +1375,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
- void *caller)
+ const void *caller)
{
return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
caller);
@@ -1397,13 +1397,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
}
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
- void *caller)
+ const void *caller)
{
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-1, GFP_KERNEL, caller);
}
-static struct vm_struct *find_vm_area(const void *addr)
+/**
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
+ */
+struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
@@ -1568,9 +1576,9 @@ EXPORT_SYMBOL(vmap);
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller);
+ int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node, void *caller)
+ pgprot_t prot, int node, const void *caller)
{
const int order = 0;
struct page **pages;
@@ -1643,7 +1651,7 @@ fail:
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, int node, void *caller)
+ pgprot_t prot, int node, const void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1699,7 +1707,7 @@ fail:
*/
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+ int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
gfp_mask, prot, node, caller);
@@ -1975,9 +1983,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
* IOREMAP area is treated as memory hole and no copy is done.
*
* If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0.
- * @buf should be kernel's buffer. Because this function uses KM_USER0,
- * the caller should guarantee KM_USER0 is not used.
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
*
* Note: In usual ops, vread() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
@@ -2051,9 +2057,7 @@ finished:
* IOREMAP area is treated as memory hole and no copy is done.
*
* If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0.
- * @buf should be kernel's buffer. Because this function uses KM_USER0,
- * the caller should guarantee KM_USER0 is not used.
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
*
* Note: In usual ops, vwrite() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9d1d3..347b3ff2a47 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1567,7 +1567,8 @@ static int vmscan_swappiness(struct scan_control *sc)
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
- * nr[0] = anon pages to scan; nr[1] = file pages to scan
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
@@ -2537,7 +2538,7 @@ loop_again:
* consider it to be no longer congested. It's
* possible there are dirty pages backed by
* congested BDIs but as pressure is relieved,
- * spectulatively avoid congestion waits
+ * speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
if (i <= *classzone_idx)
@@ -2688,7 +2689,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- schedule();
+
+ if (!kthread_should_stop())
+ schedule();
+
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
@@ -2955,14 +2959,17 @@ int kswapd_run(int nid)
}
/*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
- if (kswapd)
+ if (kswapd) {
kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
}
static int __init kswapd_init(void)