Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   23
-rw-r--r--  mm/Kconfig.debug      |   25
-rw-r--r--  mm/Makefile           |    9
-rw-r--r--  mm/backing-dev.c      |   99
-rw-r--r--  mm/bootmem.c          |  188
-rw-r--r--  mm/cleancache.c       |  244
-rw-r--r--  mm/compaction.c       |  135
-rw-r--r--  mm/dmapool.c          |    2
-rw-r--r--  mm/failslab.c         |   39
-rw-r--r--  mm/filemap.c          |  401
-rw-r--r--  mm/filemap_xip.c      |    4
-rw-r--r--  mm/fremap.c           |    6
-rw-r--r--  mm/huge_memory.c      |  150
-rw-r--r--  mm/hugetlb.c          |   89
-rw-r--r--  mm/hwpoison-inject.c  |    2
-rw-r--r--  mm/init-mm.c          |    3
-rw-r--r--  mm/internal.h         |   11
-rw-r--r--  mm/kmemleak.c         |   15
-rw-r--r--  mm/ksm.c              |   38
-rw-r--r--  mm/maccess.c          |    8
-rw-r--r--  mm/madvise.c          |    2
-rw-r--r--  mm/memblock.c         |  249
-rw-r--r--  mm/memcontrol.c       | 1573
-rw-r--r--  mm/memory-failure.c   |  182
-rw-r--r--  mm/memory.c           |  778
-rw-r--r--  mm/memory_hotplug.c   |  101
-rw-r--r--  mm/mempolicy.c        |  199
-rw-r--r--  mm/migrate.c          |   77
-rw-r--r--  mm/mincore.c          |   11
-rw-r--r--  mm/mlock.c            |   30
-rw-r--r--  mm/mmap.c             |  201
-rw-r--r--  mm/mremap.c           |   16
-rw-r--r--  mm/nobootmem.c        |  404
-rw-r--r--  mm/nommu.c            |  212
-rw-r--r--  mm/oom_kill.c         |  137
-rw-r--r--  mm/page-writeback.c   |  316
-rw-r--r--  mm/page_alloc.c       |  457
-rw-r--r--  mm/page_cgroup.c      |  245
-rw-r--r--  mm/page_io.c          |    2
-rw-r--r--  mm/pagewalk.c         |   73
-rw-r--r--  mm/percpu.c           |   19
-rw-r--r--  mm/prio_tree.c        |    1
-rw-r--r--  mm/readahead.c        |   20
-rw-r--r--  mm/rmap.c             |  440
-rw-r--r--  mm/shmem.c            | 2224
-rw-r--r--  mm/slab.c             |  187
-rw-r--r--  mm/slob.c             |   14
-rw-r--r--  mm/slub.c             | 1262
-rw-r--r--  mm/sparse.c           |    4
-rw-r--r--  mm/swap.c             |  244
-rw-r--r--  mm/swap_state.c       |    5
-rw-r--r--  mm/swapfile.c         |  468
-rw-r--r--  mm/thrash.c           |  120
-rw-r--r--  mm/truncate.c         |  199
-rw-r--r--  mm/util.c             |   26
-rw-r--r--  mm/vmalloc.c          |  194
-rw-r--r--  mm/vmscan.c           |  448
-rw-r--r--  mm/vmstat.c           |  273
58 files changed, 7997 insertions, 4907 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e9c0c61f2dd..f2f1ca19ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM
depends on !SMP
bool
default y
+
+config CLEANCACHE
+ bool "Enable cleancache driver to cache clean pages if tmem is present"
+ default n
+ help
+ Cleancache can be thought of as a page-granularity victim cache
+ for clean pages that the kernel's pageframe replacement algorithm
+ (PFRA) would like to keep around, but can't since there isn't enough
+ memory. So when the PFRA "evicts" a page, it first attempts to use
+ cleancache code to put the data contained in that page into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. And when a cleancache-enabled
+ filesystem wishes to access a page in a file on disk, it first
+ checks cleancache to see if it already contains it; if it does,
+ the page is copied into the kernel and a disk access is avoided.
+ When a transcendent memory driver is available (such as zcache or
+ Xen transcendent memory), a significant I/O reduction
+ may be achieved. When none is available, all cleancache calls
+ are reduced to a single pointer-compare-against-NULL resulting
+ in a negligible performance hit.
+
+ If unsure, say Y to enable cleancache
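[Editor's note] The help text above describes the two hooks a cleancache-enabled filesystem relies on: a "put" when the page frame reclaim code evicts a clean page, and a "get" before falling back to a disk read. The fragment below is a minimal, hypothetical sketch of the read side only; cleancache_get_page() is assumed to be the inline wrapper around __cleancache_get_page() (the function added in mm/cleancache.c further down in this diff), and read_one_page()/submit_bio_for_read() are illustrative names, not part of this patch.

	static int read_one_page(struct page *page)
	{
		/* The page is locked and not yet uptodate at this point. */
		if (cleancache_get_page(page) == 0) {
			/* Hit: transcendent memory filled the page, skip the disk. */
			SetPageUptodate(page);
			unlock_page(page);
			return 0;
		}
		/* Miss: fall back to a normal block-layer read. */
		return submit_bio_for_read(page);
	}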
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f..8b1a477162d 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
- depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
- depends on !HIBERNATION || !PPC && !SPARC
+ depends on DEBUG_KERNEL
+ depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
depends on !KMEMCHECK
+ select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
of memory corruption.
+ For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
+ fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages(). Additionally,
+ this option cannot be enabled in combination with hibernation as
+ that would result in incorrect warnings of memory corruption after
+ a resume because free pages are not saved to the suspend image.
+
config WANT_PAGE_DEBUG_FLAGS
bool
config PAGE_POISONING
- bool "Debug page memory allocations"
- depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
- depends on !HIBERNATION
- select DEBUG_PAGEALLOC
+ bool
select WANT_PAGE_DEBUG_FLAGS
- ---help---
- Fill the pages with poison patterns after free_pages() and verify
- the patterns before alloc_pages(). This results in a large slowdown,
- but helps to find certain types of memory corruption.
-
- This option cannot be enabled in combination with hibernation as
- that would result in incorrect warnings of memory corruption after
- a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 2b1b575ae71..836e4163c1b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
-obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
@@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
$(mmu-y)
obj-y += init-mm.o
+ifdef CONFIG_NO_BOOTMEM
+ obj-y += nobootmem.o
+else
+ obj-y += bootmem.o
+endif
+
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
@@ -43,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d3022..d6edf8d14f9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
-
struct backing_dev_info default_backing_dev_info = {
.name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
- .unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
@@ -51,6 +45,17 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+}
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -69,38 +74,46 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+ unsigned long nr_dirty, nr_io, nr_more_io;
struct inode *inode;
- nr_wb = nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_lock);
+ nr_dirty = nr_io = nr_more_io = 0;
+ spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_lock);
+ spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
- "BdiWriteback: %8lu kB\n"
- "BdiReclaimable: %8lu kB\n"
- "BdiDirtyThresh: %8lu kB\n"
- "DirtyThresh: %8lu kB\n"
- "BackgroundThresh: %8lu kB\n"
- "b_dirty: %8lu\n"
- "b_io: %8lu\n"
- "b_more_io: %8lu\n"
- "bdi_list: %8u\n"
- "state: %8lx\n",
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ (unsigned long) K(bdi->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -255,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
return wb_has_dirty_io(&bdi->wb);
}
-static void bdi_flush_io(struct backing_dev_info *bdi)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .range_cyclic = 1,
- .nr_to_write = 1024,
- };
-
- writeback_inodes_wb(&bdi->wb, &wbc);
-}
-
/*
* kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -452,9 +453,10 @@ static int bdi_forker_thread(void *ptr)
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
- * the bdi from the thread.
+ * the bdi from the thread. Hopefully 1024 is
+ * large enough for efficient IO.
*/
- bdi_flush_io(bdi);
+ writeback_inodes_wb(&bdi->wb, 1024);
} else {
/*
* The spinlock makes sure we do not lose
@@ -511,7 +513,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
- synchronize_rcu();
+ synchronize_rcu_expedited();
}
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -604,7 +606,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_bdi == bdi)
- sb->s_bdi = NULL;
+ sb->s_bdi = &default_backing_dev_info;
}
spin_unlock(&sb_lock);
}
@@ -612,6 +614,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
del_timer_sync(&bdi->wb.wakeup_timer);
@@ -634,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -659,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi)
}
bdi->dirty_exceeded = 0;
+
+ bdi->bw_time_stamp = jiffies;
+ bdi->written_stamp = 0;
+
+ bdi->write_bandwidth = INIT_BW;
+ bdi->avg_write_bandwidth = INIT_BW;
+
err = prop_local_init_percpu(&bdi->completions);
if (err) {
@@ -682,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_lock);
+ bdi_lock_two(&bdi->wb, dst);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_lock);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
}
bdi_unregister(bdi);
@@ -793,7 +810,7 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absense of zone congestion, cond_resched() is called to yield
+ * In the absence of zone congestion, cond_resched() is called to yield
* the processor if necessary but otherwise does not sleep.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 13b0caa9793..01d5a4b3dd0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -23,19 +23,17 @@
#include "internal.h"
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data = {
+ .bdata = &bootmem_node_data[0]
+};
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
-
-#ifndef CONFIG_NO_BOOTMEM
bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -146,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
min_low_pfn = start;
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
-#endif
+
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -171,53 +169,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
}
}
-#ifdef CONFIG_NO_BOOTMEM
-static void __init __free_pages_memory(unsigned long start, unsigned long end)
-{
- int i;
- unsigned long start_aligned, end_aligned;
- int order = ilog2(BITS_PER_LONG);
-
- start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
- end_aligned = end & ~(BITS_PER_LONG - 1);
-
- if (end_aligned <= start_aligned) {
- for (i = start; i < end; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
-
- return;
- }
-
- for (i = start; i < start_aligned; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
-
- for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
- __free_pages_bootmem(pfn_to_page(i), order);
-
- for (i = end_aligned; i < end; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
-}
-
-unsigned long __init free_all_memory_core_early(int nodeid)
-{
- int i;
- u64 start, end;
- unsigned long count = 0;
- struct range *range = NULL;
- int nr_range;
-
- nr_range = get_free_all_memory_range(&range, nodeid);
-
- for (i = 0; i < nr_range; i++) {
- start = range[i].start;
- end = range[i].end;
- count += end - start;
- __free_pages_memory(start, end);
- }
-
- return count;
-}
-#else
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
@@ -278,7 +229,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
-#endif
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -289,12 +239,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
-#ifdef CONFIG_NO_BOOTMEM
- /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
- return 0;
-#else
return free_all_bootmem_core(pgdat->bdata);
-#endif
}
/**
@@ -304,16 +249,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
*/
unsigned long __init free_all_bootmem(void)
{
-#ifdef CONFIG_NO_BOOTMEM
- /*
- * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
- * because in some case like Node0 doesnt have RAM installed
- * low ram will be on Node1
- * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
- * will be used instead of only Node0 related
- */
- return free_all_memory_core_early(MAX_NUMNODES);
-#else
unsigned long total_pages = 0;
bootmem_data_t *bdata;
@@ -321,10 +256,8 @@ unsigned long __init free_all_bootmem(void)
total_pages += free_all_bootmem_core(bdata);
return total_pages;
-#endif
}
-#ifndef CONFIG_NO_BOOTMEM
static void __init __free(bootmem_data_t *bdata,
unsigned long sidx, unsigned long eidx)
{
@@ -419,7 +352,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
}
BUG();
}
-#endif
/**
* free_bootmem_node - mark a page range as usable
@@ -434,10 +366,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
-#ifdef CONFIG_NO_BOOTMEM
- kmemleak_free_part(__va(physaddr), size);
- memblock_x86_free_range(physaddr, physaddr + size);
-#else
unsigned long start, end;
kmemleak_free_part(__va(physaddr), size);
@@ -446,7 +374,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
end = PFN_DOWN(physaddr + size);
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
-#endif
}
/**
@@ -460,10 +387,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
{
-#ifdef CONFIG_NO_BOOTMEM
- kmemleak_free_part(__va(addr), size);
- memblock_x86_free_range(addr, addr + size);
-#else
unsigned long start, end;
kmemleak_free_part(__va(addr), size);
@@ -472,7 +395,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
end = PFN_DOWN(addr + size);
mark_bootmem(start, end, 0, 0);
-#endif
}
/**
@@ -489,17 +411,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
{
-#ifdef CONFIG_NO_BOOTMEM
- panic("no bootmem");
- return 0;
-#else
unsigned long start, end;
start = PFN_DOWN(physaddr);
end = PFN_UP(physaddr + size);
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
-#endif
}
/**
@@ -515,20 +432,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
{
-#ifdef CONFIG_NO_BOOTMEM
- panic("no bootmem");
- return 0;
-#else
unsigned long start, end;
start = PFN_DOWN(addr);
end = PFN_UP(addr + size);
return mark_bootmem(start, end, 1, flags);
-#endif
}
-#ifndef CONFIG_NO_BOOTMEM
int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
@@ -685,33 +596,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
#endif
return NULL;
}
-#endif
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit)
{
-#ifdef CONFIG_NO_BOOTMEM
- void *ptr;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
-
-restart:
-
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
-
- if (ptr)
- return ptr;
-
- if (goal != 0) {
- goal = 0;
- goto restart;
- }
-
- return NULL;
-#else
bootmem_data_t *bdata;
void *region;
@@ -737,7 +627,6 @@ restart:
}
return NULL;
-#endif
}
/**
@@ -758,10 +647,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
{
unsigned long limit = 0;
-#ifdef CONFIG_NO_BOOTMEM
- limit = -1UL;
-#endif
-
return ___alloc_bootmem_nopanic(size, align, goal, limit);
}
@@ -798,14 +683,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
{
unsigned long limit = 0;
-#ifdef CONFIG_NO_BOOTMEM
- limit = -1UL;
-#endif
-
return ___alloc_bootmem(size, align, goal, limit);
}
-#ifndef CONFIG_NO_BOOTMEM
static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
@@ -822,7 +702,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
return ___alloc_bootmem(size, align, goal, limit);
}
-#endif
/**
* __alloc_bootmem_node - allocate boot memory from a specific node
@@ -842,24 +721,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- void *ptr;
-
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, -1ULL);
- if (ptr)
- return ptr;
-
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, -1ULL);
-#else
- ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
-#endif
-
- return ptr;
+ return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
}
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -880,13 +745,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
unsigned long new_goal;
new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- new_goal, -1ULL);
-#else
ptr = alloc_bootmem_core(pgdat->bdata, size, align,
new_goal, 0);
-#endif
if (ptr)
return ptr;
}
@@ -907,16 +767,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
-#ifdef CONFIG_NO_BOOTMEM
- unsigned long pfn, goal, limit;
-
- pfn = section_nr_to_pfn(section_nr);
- goal = pfn << PAGE_SHIFT;
- limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
-
- return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
- SMP_CACHE_BYTES, goal, limit);
-#else
bootmem_data_t *bdata;
unsigned long pfn, goal, limit;
@@ -926,7 +776,6 @@ void * __init alloc_bootmem_section(unsigned long size,
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
-#endif
}
#endif
@@ -938,16 +787,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, -1ULL);
-#else
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
-#endif
if (ptr)
return ptr;
@@ -995,21 +839,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- void *ptr;
-
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ return ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
- if (ptr)
- return ptr;
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
-#else
- ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
-#endif
- return ptr;
}
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 00000000000..bcaae4c2a77
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
+/*
+ * Cleancache frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of cleancache. See
+ * Documentation/vm/cleancache.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+#include <linux/cleancache.h>
+
+/*
+ * This global enablement flag may be read thousands of times per second
+ * by cleancache_get/put/flush even on systems where cleancache_ops
+ * is not claimed (e.g. cleancache is config'ed on but remains
+ * disabled), so is preferred to the slower alternative: a function
+ * call that checks a non-global.
+ */
+int cleancache_enabled;
+EXPORT_SYMBOL(cleancache_enabled);
+
+/*
+ * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * to the cleancache "backend" implementation functions.
+ */
+static struct cleancache_ops cleancache_ops;
+
+/* useful stats available in /sys/kernel/mm/cleancache */
+static unsigned long cleancache_succ_gets;
+static unsigned long cleancache_failed_gets;
+static unsigned long cleancache_puts;
+static unsigned long cleancache_flushes;
+
+/*
+ * register operations for cleancache, returning previous thus allowing
+ * detection of multiple backends and possible nesting
+ */
+struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
+{
+ struct cleancache_ops old = cleancache_ops;
+
+ cleancache_ops = *ops;
+ cleancache_enabled = 1;
+ return old;
+}
+EXPORT_SYMBOL(cleancache_register_ops);
+
+/* Called by a cleancache-enabled filesystem at time of mount */
+void __cleancache_init_fs(struct super_block *sb)
+{
+ sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_fs);
+
+/* Called by a cleancache-enabled clustered filesystem at time of mount */
+void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+{
+ sb->cleancache_poolid =
+ (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_shared_fs);
+
+/*
+ * If the filesystem uses exportable filehandles, use the filehandle as
+ * the key, else use the inode number.
+ */
+static int cleancache_get_key(struct inode *inode,
+ struct cleancache_filekey *key)
+{
+ int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+ int len = 0, maxlen = CLEANCACHE_KEY_MAX;
+ struct super_block *sb = inode->i_sb;
+
+ key->u.ino = inode->i_ino;
+ if (sb->s_export_op != NULL) {
+ fhfn = sb->s_export_op->encode_fh;
+ if (fhfn) {
+ struct dentry d;
+ d.d_inode = inode;
+ len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
+ if (len <= 0 || len == 255)
+ return -1;
+ if (maxlen > CLEANCACHE_KEY_MAX)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * "Get" data from cleancache associated with the poolid/inode/index
+ * that were specified when the data was put to cleancache and, if
+ * successful, use it to fill the specified page with data and return 0.
+ * If the get fails, the pageframe is left unchanged and -1 is returned.
+ * Page must be locked by caller.
+ */
+int __cleancache_get_page(struct page *page)
+{
+ int ret = -1;
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
+ goto out;
+
+ if (cleancache_get_key(page->mapping->host, &key) < 0)
+ goto out;
+
+ ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
+ if (ret == 0)
+ cleancache_succ_gets++;
+ else
+ cleancache_failed_gets++;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(__cleancache_get_page);
+
+/*
+ * "Put" data from a page to cleancache and associate it with the
+ * (previously-obtained per-filesystem) poolid and the page's
+ * inode and page index. Page must be locked. Note that a put_page
+ * always "succeeds", though a subsequent get_page may succeed or fail.
+ */
+void __cleancache_put_page(struct page *page)
+{
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id >= 0 &&
+ cleancache_get_key(page->mapping->host, &key) >= 0) {
+ (*cleancache_ops.put_page)(pool_id, key, page->index, page);
+ cleancache_puts++;
+ }
+}
+EXPORT_SYMBOL(__cleancache_put_page);
+
+/*
+ * Flush any data from cleancache associated with the poolid and the
+ * page's inode and page index so that a subsequent "get" will fail.
+ */
+void __cleancache_flush_page(struct address_space *mapping, struct page *page)
+{
+ /* careful... page->mapping is NULL sometimes when this is called */
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (pool_id >= 0) {
+ VM_BUG_ON(!PageLocked(page));
+ if (cleancache_get_key(mapping->host, &key) >= 0) {
+ (*cleancache_ops.flush_page)(pool_id, key, page->index);
+ cleancache_flushes++;
+ }
+ }
+}
+EXPORT_SYMBOL(__cleancache_flush_page);
+
+/*
+ * Flush all data from cleancache associated with the poolid and the
+ * mapping's inode so that all subsequent gets to this poolid/inode
+ * will fail.
+ */
+void __cleancache_flush_inode(struct address_space *mapping)
+{
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
+ (*cleancache_ops.flush_inode)(pool_id, key);
+}
+EXPORT_SYMBOL(__cleancache_flush_inode);
+
+/*
+ * Called by any cleancache-enabled filesystem at time of unmount;
+ * note that pool_id is surrendered and may be returned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs
+ */
+void __cleancache_flush_fs(struct super_block *sb)
+{
+ if (sb->cleancache_poolid >= 0) {
+ int old_poolid = sb->cleancache_poolid;
+ sb->cleancache_poolid = -1;
+ (*cleancache_ops.flush_fs)(old_poolid);
+ }
+}
+EXPORT_SYMBOL(__cleancache_flush_fs);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
+
+#define CLEANCACHE_SYSFS_RO(_name) \
+ static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+ { \
+ return sprintf(buf, "%lu\n", cleancache_##_name); \
+ } \
+ static struct kobj_attribute cleancache_##_name##_attr = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = cleancache_##_name##_show, \
+ }
+
+CLEANCACHE_SYSFS_RO(succ_gets);
+CLEANCACHE_SYSFS_RO(failed_gets);
+CLEANCACHE_SYSFS_RO(puts);
+CLEANCACHE_SYSFS_RO(flushes);
+
+static struct attribute *cleancache_attrs[] = {
+ &cleancache_succ_gets_attr.attr,
+ &cleancache_failed_gets_attr.attr,
+ &cleancache_puts_attr.attr,
+ &cleancache_flushes_attr.attr,
+ NULL,
+};
+
+static struct attribute_group cleancache_attr_group = {
+ .attrs = cleancache_attrs,
+ .name = "cleancache",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_cleancache(void)
+{
+#ifdef CONFIG_SYSFS
+ int err;
+
+ err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
+#endif /* CONFIG_SYSFS */
+ return 0;
+}
+module_init(init_cleancache)
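[Editor's note] mm/cleancache.c above is only the frontend; a backend claims it by passing a filled-in struct cleancache_ops to cleancache_register_ops(). The sketch below is a hypothetical do-nothing backend, with the ops field names and signatures inferred from the (*cleancache_ops.xxx)() call sites in the file above; the authoritative struct definition lives in include/linux/cleancache.h, which is outside this mm-only diff, so treat the exact layout as an assumption.

	static int null_init_fs(size_t pagesize)
	{
		return -1;	/* refuse to create a pool: gets/puts become no-ops */
	}

	static int null_get_page(int pool_id, struct cleancache_filekey key,
				 pgoff_t index, struct page *page)
	{
		return -1;	/* always miss */
	}

	static void null_put_page(int pool_id, struct cleancache_filekey key,
				  pgoff_t index, struct page *page)
	{
	}

	static void null_flush_page(int pool_id, struct cleancache_filekey key,
				    pgoff_t index)
	{
	}

	static void null_flush_inode(int pool_id, struct cleancache_filekey key)
	{
	}

	static void null_flush_fs(int pool_id)
	{
	}

	/* .init_shared_fs is omitted for brevity; a real backend must provide it. */
	static struct cleancache_ops null_cleancache_ops = {
		.init_fs	= null_init_fs,
		.get_page	= null_get_page,
		.put_page	= null_put_page,
		.flush_page	= null_flush_page,
		.flush_inode	= null_flush_inode,
		.flush_fs	= null_flush_fs,
	};

	static int __init null_cleancache_init(void)
	{
		/* The previous ops are returned, allowing detection of nesting. */
		struct cleancache_ops old_ops =
			cleancache_register_ops(&null_cleancache_ops);
		(void)old_ops;
		return 0;
	}
	module_init(null_cleancache_init);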
diff --git a/mm/compaction.c b/mm/compaction.c
index 8be430b812d..6cc604bd564 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,8 +42,6 @@ struct compact_control {
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
-
- int compact_mode;
};
static unsigned long release_freepages(struct list_head *freelist)
@@ -146,16 +144,26 @@ static void isolate_freepages(struct zone *zone,
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
+ /*
+ * Initialise the free scanner. The starting point is where we last
+ * scanned from (or the end of the zone if starting). The low point
+ * is the end of the pageblock the migration scanner is using.
+ */
pfn = cc->free_pfn;
low_pfn = cc->migrate_pfn + pageblock_nr_pages;
- high_pfn = low_pfn;
+
+ /*
+ * Take care that if the migration scanner is at the end of the zone
+ * that the free scanner does not accidentally move to the next zone
+ * in the next isolation cycle.
+ */
+ high_pfn = min(low_pfn, pfn);
/*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- spin_lock_irqsave(&zone->lock, flags);
for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
@@ -178,9 +186,19 @@ static void isolate_freepages(struct zone *zone,
if (!suitable_migration_target(page))
continue;
- /* Found a block suitable for isolating free pages from */
- isolated = isolate_freepages_block(zone, pfn, freelist);
- nr_freepages += isolated;
+ /*
+ * Found a block suitable for isolating free pages from. Now
+ * we disabled interrupts, double check things are ok and
+ * isolate the pages. This is to minimise the time IRQs
+ * are disabled
+ */
+ isolated = 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ if (suitable_migration_target(page)) {
+ isolated = isolate_freepages_block(zone, pfn, freelist);
+ nr_freepages += isolated;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
/*
* Record the highest PFN we isolated pages from. When next
@@ -190,7 +208,6 @@ static void isolate_freepages(struct zone *zone,
if (isolated)
high_pfn = max(high_pfn, pfn);
}
- spin_unlock_irqrestore(&zone->lock, flags);
/* split_free_page does not map the pages */
list_for_each_entry(page, freelist, lru) {
@@ -234,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
return isolated > (inactive + active) / 2;
}
+/* possible outcome of isolate_migratepages */
+typedef enum {
+ ISOLATE_ABORT, /* Abort compaction now */
+ ISOLATE_NONE, /* No pages isolated, continue scanning */
+ ISOLATE_SUCCESS, /* Pages isolated, migrate */
+} isolate_migrate_t;
+
/*
* Isolate all pages that can be migrated from the block pointed to by
* the migrate scanner within compact_control.
*/
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
@@ -255,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
/* Do not cross the free scanner or scan within a memory hole */
if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
cc->migrate_pfn = end_pfn;
- return 0;
+ return ISOLATE_NONE;
}
/*
@@ -264,16 +288,38 @@ static unsigned long isolate_migratepages(struct zone *zone,
* delay for some time until fewer pages are isolated
*/
while (unlikely(too_many_isolated(zone))) {
+ /* async migration should just abort */
+ if (!cc->sync)
+ return ISOLATE_ABORT;
+
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
- return 0;
+ return ISOLATE_ABORT;
}
/* Time to isolate some pages for migration */
+ cond_resched();
spin_lock_irq(&zone->lru_lock);
for (; low_pfn < end_pfn; low_pfn++) {
struct page *page;
+ bool locked = true;
+
+ /* give a chance to irqs before checking need_resched() */
+ if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ spin_unlock_irq(&zone->lru_lock);
+ locked = false;
+ }
+ if (need_resched() || spin_is_contended(&zone->lru_lock)) {
+ if (locked)
+ spin_unlock_irq(&zone->lru_lock);
+ cond_resched();
+ spin_lock_irq(&zone->lru_lock);
+ if (fatal_signal_pending(current))
+ break;
+ } else if (!locked)
+ spin_lock_irq(&zone->lru_lock);
+
if (!pfn_valid_within(low_pfn))
continue;
nr_scanned++;
@@ -334,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
- return cc->nr_migratepages;
+ return ISOLATE_SUCCESS;
}
/*
@@ -396,16 +442,6 @@ static int compact_finished(struct zone *zone,
if (cc->free_pfn <= cc->migrate_pfn)
return COMPACT_COMPLETE;
- /* Compaction run is not finished if the watermark is not met */
- if (cc->compact_mode != COMPACT_MODE_KSWAPD)
- watermark = low_wmark_pages(zone);
- else
- watermark = high_wmark_pages(zone);
- watermark += (1 << cc->order);
-
- if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
- return COMPACT_CONTINUE;
-
/*
* order == -1 is expected when compacting via
* /proc/sys/vm/compact_memory
@@ -413,13 +449,11 @@ static int compact_finished(struct zone *zone,
if (cc->order == -1)
return COMPACT_CONTINUE;
- /*
- * Generating only one page of the right order is not enough
- * for kswapd, we must continue until we're above the high
- * watermark as a pool for high order GFP_ATOMIC allocations
- * too.
- */
- if (cc->compact_mode == COMPACT_MODE_KSWAPD)
+ /* Compaction run is not finished if the watermark is not met */
+ watermark = low_wmark_pages(zone);
+ watermark += (1 << cc->order);
+
+ if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
@@ -449,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
unsigned long watermark;
/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+ if (order == -1)
+ return COMPACT_CONTINUE;
+
+ /*
* Watermarks for order-0 must be met for compaction. Note the 2UL.
* This is because during migration, copies of pages need to be
* allocated and for a short time, the footprint is higher
@@ -458,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
return COMPACT_SKIPPED;
/*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
- */
- if (order == -1)
- return COMPACT_CONTINUE;
-
- /*
* fragmentation index determines if allocation failures are due to
* low memory or external fragmentation
*
- * index of -1 implies allocations might succeed dependingon watermarks
+ * index of -1000 implies allocations might succeed depending on
+ * watermarks
* index towards 0 implies failure is due to lack of memory
* index towards 1000 implies failure is due to fragmentation
*
@@ -478,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
return COMPACT_SKIPPED;
- if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+ if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+ 0, 0))
return COMPACT_PARTIAL;
return COMPACT_CONTINUE;
@@ -508,12 +544,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
unsigned long nr_migrate, nr_remaining;
+ int err;
- if (!isolate_migratepages(zone, cc))
+ switch (isolate_migratepages(zone, cc)) {
+ case ISOLATE_ABORT:
+ ret = COMPACT_PARTIAL;
+ goto out;
+ case ISOLATE_NONE:
continue;
+ case ISOLATE_SUCCESS:
+ ;
+ }
nr_migrate = cc->nr_migratepages;
- migrate_pages(&cc->migratepages, compaction_alloc,
+ err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
cc->sync);
update_nr_listpages(cc);
@@ -527,13 +571,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_remaining);
/* Release LRU pages not migrated */
- if (!list_empty(&cc->migratepages)) {
+ if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
}
}
+out:
/* Release free pages and check accounting */
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
@@ -543,8 +588,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- bool sync,
- int compact_mode)
+ bool sync)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -553,7 +597,6 @@ unsigned long compact_zone_order(struct zone *zone,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
- .compact_mode = compact_mode,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -599,8 +642,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- status = compact_zone_order(zone, order, gfp_mask, sync,
- COMPACT_MODE_DIRECT_RECLAIM);
+ status = compact_zone_order(zone, order, gfp_mask, sync);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
@@ -631,7 +673,6 @@ static int compact_node(int nid)
.nr_freepages = 0,
.nr_migratepages = 0,
.order = -1,
- .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
};
zone = &pgdat->node_zones[zoneid];
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519..fbb58e34688 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
{
struct device *dev = pool->dev;
- dma_pool_destroy(pool);
WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+ dma_pool_destroy(pool);
}
EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240dd..0dd7b8fec71 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct {
struct fault_attr attr;
u32 ignore_gfp_wait;
int cache_filter;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
- struct dentry *ignore_gfp_wait_file;
- struct dentry *cache_filter_file;
-#endif
} failslab = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab);
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init failslab_debugfs_init(void)
{
- mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
- int err;
-
- err = init_fault_attr_dentries(&failslab.attr, "failslab");
- if (err)
- return err;
- dir = failslab.attr.dentries.dir;
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
- failslab.ignore_gfp_wait_file =
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &failslab.ignore_gfp_wait);
+ dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
- failslab.cache_filter_file =
- debugfs_create_bool("cache-filter", mode, dir,
- &failslab.cache_filter);
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_wait))
+ goto fail;
+ if (!debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter))
+ goto fail;
- if (!failslab.ignore_gfp_wait_file ||
- !failslab.cache_filter_file) {
- err = -ENOMEM;
- debugfs_remove(failslab.cache_filter_file);
- debugfs_remove(failslab.ignore_gfp_wait_file);
- cleanup_fault_attr_dentries(&failslab.attr);
- }
+ return 0;
+fail:
+ debugfs_remove_recursive(dir);
- return err;
+ return -ENOMEM;
}
late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468..645a080ba4d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,7 @@
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
#include "internal.h"
/*
@@ -58,16 +58,16 @@
/*
* Lock ordering:
*
- * ->i_mmap_lock (truncate_pagecache)
+ * ->i_mmap_mutex (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
* ->i_mutex
- * ->i_mmap_lock (truncate->unmap_mapping_range)
+ * ->i_mmap_mutex (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
@@ -77,14 +77,11 @@
* ->i_mutex (generic_file_buffered_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
- * ->i_mutex
- * ->i_alloc_sem (various)
- *
- * ->inode_lock
- * ->sb_lock (fs/fs-writeback.c)
+ * bdi->wb.list_lock
+ * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
@@ -98,26 +95,39 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (zap_pte_range->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
- * ->i_mmap_lock
+ * ->i_mmap_mutex
*/
/*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the mapping's tree_lock.
*/
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ /*
+ * if we're uptodate, flush out into the cleancache, otherwise
+ * invalidate any existing cleancache entries. We can't leave
+ * stale data around in the cleancache once our page is gone
+ */
+ if (PageUptodate(page) && PageMappedToDisk(page))
+ cleancache_put_page(page);
+ else
+ cleancache_flush_page(mapping, page);
+
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
@@ -137,7 +147,15 @@ void __remove_from_page_cache(struct page *page)
}
}
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
void (*freepage)(struct page *);
@@ -146,54 +164,25 @@ void remove_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
if (freepage)
freepage(page);
+ page_cache_release(page);
}
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
{
- struct address_space *mapping;
- struct page *page;
-
- page = container_of((unsigned long *)word, struct page, flags);
-
- /*
- * page_mapping() is being called without PG_locked held.
- * Some knowledge of the state and use of the page is used to
- * reduce the requirements down to a memory barrier.
- * The danger here is of a stale page_mapping() return value
- * indicating a struct address_space different from the one it's
- * associated with when it is associated with one.
- * After smp_mb(), it's either the correct page_mapping() for
- * the page, or an old page_mapping() and the page's own
- * page_mapping() has gone NULL.
- * The ->sync_page() address_space operation must tolerate
- * page_mapping() going NULL. By an amazing coincidence,
- * this comes about because none of the users of the page
- * in the ->sync_page() methods make essential use of the
- * page_mapping(), merely passing the page down to the backing
- * device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page_private(page) is
- * of interest. When page_mapping() does go NULL, the entire
- * call stack gracefully ignores the page and returns.
- * -- wli
- */
- smp_mb();
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
- mapping->a_ops->sync_page(page);
io_schedule();
return 0;
}
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
{
- sync_page(word);
+ sleep_on_page(word);
return fatal_signal_pending(current) ? -EINTR : 0;
}
@@ -387,6 +376,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
EXPORT_SYMBOL(filemap_write_and_wait_range);
/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+ int error;
+ struct mem_cgroup *memcg = NULL;
+
+ VM_BUG_ON(!PageLocked(old));
+ VM_BUG_ON(!PageLocked(new));
+ VM_BUG_ON(new->mapping);
+
+ /*
+ * This is not page migration, but prepare_migration and
+ * end_migration does enough work for charge replacement.
+ *
+ * In the longer term we probably want a specialized function
+ * for moving the charge from old to new in a more efficient
+ * manner.
+ */
+ error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+ if (error)
+ return error;
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (!error) {
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *);
+
+ pgoff_t offset = old->index;
+ freepage = mapping->a_ops->freepage;
+
+ page_cache_get(new);
+ new->mapping = mapping;
+ new->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(old);
+ error = radix_tree_insert(&mapping->page_tree, offset, new);
+ BUG_ON(error);
+ mapping->nrpages++;
+ __inc_zone_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(new))
+ __inc_zone_page_state(new, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ if (freepage)
+ freepage(old);
+ page_cache_release(old);
+ mem_cgroup_end_migration(memcg, old, new, true);
+ } else {
+ mem_cgroup_end_migration(memcg, old, new, false);
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
* @mapping: the page's address_space
@@ -402,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
int error;
VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
@@ -419,11 +479,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
- if (PageSwapBacked(page))
- __inc_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
@@ -441,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
{
int ret;
- /*
- * Splice_read and readahead add shmem/tmpfs pages into the page cache
- * before shmem_readpage has a chance to mark them as SwapBacked: they
- * need to go on the anon lru below, and mem_cgroup_cache_charge
- * (called in add_to_page_cache) needs to know where they're going too.
- */
- if (mapping_cap_swap_backed(mapping))
- SetPageSwapBacked(page);
-
ret = add_to_page_cache(page, mapping, offset, gfp_mask);
- if (ret == 0) {
- if (page_is_file_cache(page))
- lru_cache_add_file(page);
- else
- lru_cache_add_anon(page);
- }
+ if (ret == 0)
+ lru_cache_add_file(page);
return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -479,12 +525,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
EXPORT_SYMBOL(__page_cache_alloc);
#endif
-static int __sleep_on_page_lock(void *word)
-{
- io_schedule();
- return 0;
-}
-
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
@@ -512,11 +552,22 @@ void wait_on_page_bit(struct page *page, int bit_nr)
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+ if (!test_bit(bit_nr, &page->flags))
+ return 0;
+
+ return __wait_on_bit(page_waitqueue(page), &wait,
+ sleep_on_page_killable, TASK_KILLABLE);
+}
+
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
@@ -576,17 +627,12 @@ EXPORT_SYMBOL(end_page_writeback);
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/
void __lock_page(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
@@ -596,34 +642,39 @@ int __lock_page_killable(struct page *page)
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait,
- sync_page_killable, TASK_KILLABLE);
+ sleep_on_page_killable, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
- TASK_UNINTERRUPTIBLE);
-}
-
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
- if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
- __lock_page(page);
- return 1;
- } else {
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ /*
+ * CAUTION! In this case, mmap_sem is not released
+ * even though we return 0.
+ */
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
up_read(&mm->mmap_sem);
- wait_on_page_locked(page);
+ if (flags & FAULT_FLAG_KILLABLE)
+ wait_on_page_locked_killable(page);
+ else
+ wait_on_page_locked(page);
return 0;
+ } else {
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
+
+ ret = __lock_page_killable(page);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
}
}
@@ -648,9 +699,16 @@ repeat:
page = radix_tree_deref_slot(pagep);
if (unlikely(!page))
goto out;
- if (radix_tree_deref_retry(page))
- goto repeat;
-
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto repeat;
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ goto out;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -687,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
repeat:
page = find_get_page(mapping, offset);
- if (page) {
+ if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) {
@@ -774,7 +832,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
rcu_read_lock();
restart:
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, start, nr_pages);
+ (void ***)pages, NULL, start, nr_pages);
ret = 0;
for (i = 0; i < nr_found; i++) {
struct page *page;
@@ -782,10 +840,23 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- if (radix_tree_deref_retry(page)) {
- if (ret)
- start = pages[ret-1]->index;
- goto restart;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ WARN_ON(start | i);
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so skip over it -
+ * we only reach this from invalidate_mapping_pages().
+ */
+ continue;
}
if (!page_cache_get_speculative(page))
@@ -800,6 +871,13 @@ repeat:
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
return ret;
}
@@ -826,7 +904,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
restart:
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, index, nr_pages);
+ (void ***)pages, NULL, index, nr_pages);
ret = 0;
for (i = 0; i < nr_found; i++) {
struct page *page;
@@ -834,8 +912,23 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- if (radix_tree_deref_retry(page))
- goto restart;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so stop looking for
+ * contiguous pages.
+ */
+ break;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -894,8 +987,22 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- if (radix_tree_deref_retry(page))
- goto restart;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * This function is never used on a shmem/tmpfs
+ * mapping, so a swap entry won't be found here.
+ */
+ BUG();
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -909,6 +1016,13 @@ repeat:
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
if (ret)
@@ -1298,12 +1412,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
+ struct blk_plug plug;
count = 0;
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
+ blk_start_plug(&plug);
+
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
loff_t size;
@@ -1376,6 +1493,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
break;
}
out:
+ blk_finish_plug(&plug);
return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);
@@ -1468,15 +1586,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
/* If we don't want any read-ahead, don't bother */
if (VM_RandomReadHint(vma))
return;
+ if (!ra->ra_pages)
+ return;
- if (VM_SequentialReadHint(vma) ||
- offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+ if (VM_SequentialReadHint(vma)) {
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
return;
}
- if (ra->mmap_miss < INT_MAX)
+ /* Avoid banging the cache line if not needed */
+ if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
ra->mmap_miss++;
/*
@@ -1490,12 +1610,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
* mmap read-around
*/
ra_pages = max_sane_readahead(ra->ra_pages);
- if (ra_pages) {
- ra->start = max_t(long, 0, offset - ra_pages/2);
- ra->size = ra_pages;
- ra->async_size = 0;
- ra_submit(ra, mapping, file);
- }
+ ra->start = max_t(long, 0, offset - ra_pages / 2);
+ ra->size = ra_pages;
+ ra->async_size = ra_pages / 4;
+ ra_submit(ra, mapping, file);
}
/*
@@ -1562,6 +1680,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_get_page(mapping, offset);
@@ -1600,7 +1719,6 @@ retry_find:
return VM_FAULT_SIGBUS;
}
- ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1696,7 +1814,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
{
@@ -1727,7 +1845,7 @@ repeat:
static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
@@ -1767,7 +1885,7 @@ out:
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Same as read_cache_page, but don't wait for page to become unlocked
* after submitting it to the filler.
@@ -1779,7 +1897,7 @@ out:
*/
struct page *read_cache_page_async(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1827,7 +1945,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Read into the page cache. If a page already exists, and PageUptodate() is
* not set, try to fill the page then wait for it to become unlocked.
@@ -1836,7 +1954,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
@@ -1883,16 +2001,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
int file_remove_suid(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
- int killsuid = should_remove_suid(dentry);
- int killpriv = security_inode_need_killpriv(dentry);
+ struct inode *inode = dentry->d_inode;
+ int killsuid;
+ int killpriv;
int error = 0;
+ /* Fast path for nothing security related */
+ if (IS_NOSEC(inode))
+ return 0;
+
+ killsuid = should_remove_suid(dentry);
+ killpriv = security_inode_need_killpriv(dentry);
+
if (killpriv < 0)
return killpriv;
if (killpriv)
error = security_inode_killpriv(dentry);
if (!error && killsuid)
error = __remove_suid(dentry, killsuid);
+ if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+ inode->i_flags |= S_NOSEC;
return error;
}
@@ -2228,7 +2356,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
repeat:
page = find_lock_page(mapping, index);
if (page)
- return page;
+ goto found;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
if (!page)
@@ -2241,6 +2369,8 @@ repeat:
goto repeat;
return NULL;
}
+found:
+ wait_on_page_writeback(page);
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2487,11 +2617,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
+ blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
@@ -2502,6 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0 && ret > 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(generic_file_aio_write);
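
A hedged sketch of the lookup convention the mm/filemap.c hunks above introduce: find_get_page() may now return a radix-tree exceptional entry (a shmem/tmpfs swap entry) instead of a page pointer, so callers that cannot consume it are expected to filter it out, much as find_lock_page() now does. The helper below is illustrative only and not part of the patch.

#include <linux/pagemap.h>
#include <linux/radix-tree.h>

static struct page *get_page_skip_swap(struct address_space *mapping,
				       pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	/* Exceptional entry: a shmem swap slot, not a struct page,
	 * and no page reference was taken for it. */
	if (radix_tree_exception(page))
		return NULL;

	return page;	/* NULL, or a page with its refcount raised */
}
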
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 83364df74a3..93356cd1282 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
return;
retry:
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
@@ -201,7 +201,7 @@ retry:
page_cache_release(page);
}
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (locked) {
mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index ec520c7b28d..b8e0e2d468a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
goto out;
}
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
vma_prio_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
if (vma->vm_flags & VM_LOCKED) {
/*
* drop PG_Mlocked flag for over-mapped range
*/
- unsigned int saved_flags = vma->vm_flags;
+ vm_flags_t saved_flags = vma->vm_flags;
munlock_vma_pages_range(vma, start, start + size);
vma->vm_flags = saved_flags;
}
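
The filemap_xip.c and fremap.c hunks above are part of the tree-wide i_mmap_lock to i_mmap_mutex conversion: code walking mapping->i_mmap now takes a sleeping mutex instead of a spinlock. A minimal sketch of the resulting pattern, assuming the walker is allowed to sleep (the function name is illustrative):

#include <linux/fs.h>
#include <linux/mm.h>

static void for_each_mapping_vma(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;

	mutex_lock(&mapping->i_mmap_mutex);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* per-VMA work goes here; it may now block under the mutex */
	}
	mutex_unlock(&mapping->i_mmap_mutex);
}
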
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3e29781ee76..e2d1587be26 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf,
enum transparent_hugepage_flag flag)
{
- if (test_bit(flag, &transparent_hugepage_flags))
- return sprintf(buf, "[yes] no\n");
- else
- return sprintf(buf, "yes [no]\n");
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
}
+
static ssize_t single_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag flag)
{
- if (!memcmp("yes", buf,
- min(sizeof("yes")-1, count))) {
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
set_bit(flag, &transparent_hugepage_flags);
- } else if (!memcmp("no", buf,
- min(sizeof("no")-1, count))) {
+ else
clear_bit(flag, &transparent_hugepage_flags);
- } else
- return -EINVAL;
return count;
}
@@ -643,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
return ret;
}
-static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
- return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+ return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}
static inline struct page *alloc_hugepage_vma(int defrag,
struct vm_area_struct *vma,
- unsigned long haddr)
+ unsigned long haddr, int nd,
+ gfp_t extra_gfp)
{
- return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
- HPAGE_PMD_ORDER, vma, haddr);
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
+ HPAGE_PMD_ORDER, vma, haddr, nd);
}
#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(alloc_hugepage_gfpmask(defrag),
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
HPAGE_PMD_ORDER);
}
#endif
@@ -678,9 +683,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr);
- if (unlikely(!page))
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
put_page(page);
goto out;
@@ -799,8 +807,9 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
- vma, address);
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+ __GFP_OTHER_NODE,
+ vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_newpage_charge(pages[i], mm,
GFP_KERNEL))) {
@@ -902,16 +911,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr);
+ vma, haddr, numa_node_id(), 0);
else
new_page = NULL;
if (unlikely(!new_page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
put_page(page);
goto out;
}
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
@@ -1128,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page,
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->lock to
+ * and it won't wait on the anon_vma->root->mutex to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush_notify(vma, address, pmd);
@@ -1322,7 +1333,7 @@ static int __split_huge_page_map(struct page *page,
return ret;
}
-/* must be called with anon_vma->root->lock hold */
+/* must be called with anon_vma->root->mutex held */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
@@ -1388,6 +1399,7 @@ int split_huge_page(struct page *page)
BUG_ON(!PageSwapBacked(page));
__split_huge_page(page, anon_vma);
+ count_vm_event(THP_SPLIT);
BUG_ON(PageCompound(page));
out_unlock:
@@ -1396,6 +1408,9 @@ out:
return ret;
}
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+ VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
@@ -1404,11 +1419,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
/*
* Be somewhat over-protective like KSM for now!
*/
- if (*vm_flags & (VM_HUGEPAGE |
- VM_SHARED | VM_MAYSHARE |
- VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
+ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
@@ -1424,11 +1435,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
/*
* Be somewhat over-protective like KSM for now!
*/
- if (*vm_flags & (VM_NOHUGEPAGE |
- VM_SHARED | VM_MAYSHARE |
- VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
+ if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
return -EINVAL;
*vm_flags &= ~VM_HUGEPAGE;
*vm_flags |= VM_NOHUGEPAGE;
@@ -1562,10 +1569,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
* page fault if needed.
*/
return 0;
- if (vma->vm_file || vma->vm_ops)
+ if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+	 * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -1585,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm)
list_del(&mm_slot->mm_node);
free = 1;
}
+ spin_unlock(&khugepaged_mm_lock);
if (free) {
- spin_unlock(&khugepaged_mm_lock);
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
free_mm_slot(mm_slot);
mmdrop(mm);
} else if (mm_slot) {
- spin_unlock(&khugepaged_mm_lock);
/*
* This is required to serialize against
* khugepaged_test_exit() (which is guaranteed to run
@@ -1603,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm)
*/
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
- } else
- spin_unlock(&khugepaged_mm_lock);
+ }
}
static void release_pte_page(struct page *page)
@@ -1745,7 +1754,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma,
+ int node)
{
pgd_t *pgd;
pud_t *pud;
@@ -1759,6 +1769,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
#ifndef CONFIG_NUMA
+ up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
new_page = *hpage;
#else
@@ -1773,22 +1784,29 @@ static void collapse_huge_page(struct mm_struct *mm,
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ node, __GFP_OTHER_NODE);
+
+ /*
+ * After allocating the hugepage, release the mmap_sem read lock in
+ * preparation for taking it in write mode.
+ */
+ up_read(&mm->mmap_sem);
if (unlikely(!new_page)) {
- up_read(&mm->mmap_sem);
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
return;
}
#endif
+
+ count_vm_event(THP_COLLAPSE_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
- up_read(&mm->mmap_sem);
+#ifdef CONFIG_NUMA
put_page(new_page);
+#endif
return;
}
- /* after allocating the hugepage upgrade to mmap_sem write mode */
- up_read(&mm->mmap_sem);
-
/*
* Prevent all access to pagetables with the exception of
* gup_fast later hanlded by the ptep_clear_flush and the VM
@@ -1808,12 +1826,15 @@ static void collapse_huge_page(struct mm_struct *mm,
(vma->vm_flags & VM_NOHUGEPAGE))
goto out;
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ if (!vma->anon_vma || vma->vm_ops)
goto out;
if (is_vma_temporary_stack(vma))
goto out;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+	 * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -1919,6 +1940,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
struct page *page;
unsigned long _address;
spinlock_t *ptl;
+ int node = -1;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1949,6 +1971,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page))
goto out_unmap;
+ /*
+			 * Choose the node of the first page. This could
+ * be more sophisticated and look at more pages,
+ * but isn't for now.
+ */
+ if (node == -1)
+ node = page_to_nid(page);
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -1965,7 +1994,7 @@ out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret)
/* collapse_huge_page will return with the mmap_sem released */
- collapse_huge_page(mm, address, hpage, vma);
+ collapse_huge_page(mm, address, hpage, vma, node);
out:
return ret;
}
@@ -2038,13 +2067,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
continue;
}
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ if (!vma->anon_vma || vma->vm_ops)
goto skip;
if (is_vma_temporary_stack(vma))
goto skip;
-
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+		 * If is_pfn_mapping() is true, is_linear_pfn_mapping()
+ * must be true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) ||
+ vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2135,8 +2167,11 @@ static void khugepaged_do_scan(struct page **hpage)
#ifndef CONFIG_NUMA
if (!*hpage) {
*hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage))
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
break;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
}
#else
if (IS_ERR(*hpage))
@@ -2176,8 +2211,11 @@ static struct page *khugepaged_alloc_hugepage(void)
do {
hpage = alloc_hugepage(khugepaged_defrag());
- if (!hpage)
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
} while (unlikely(!hpage) &&
likely(khugepaged_enabled()));
return hpage;
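
Several huge_memory.c hunks above wrap huge page allocation sites with the new THP_* vmstat events. A hedged sketch of that accounting pattern, assuming CONFIG_TRANSPARENT_HUGEPAGE (alloc_thp_counted() is only an illustrative wrapper):

#include <linux/gfp.h>
#include <linux/huge_mm.h>
#include <linux/vmstat.h>

static struct page *alloc_thp_counted(gfp_t gfp)
{
	struct page *page = alloc_pages(gfp, HPAGE_PMD_ORDER);

	if (!page) {
		/* Allocation failed; the fault path falls back to 4k pages. */
		count_vm_event(THP_FAULT_FALLBACK);
		return NULL;
	}
	count_vm_event(THP_FAULT_ALLOC);
	return page;
}
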
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bb0b7c12801..dae27ba3be2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
* must either hold the mmap_sem for write, or the mmap_sem for read and
* the hugetlb_instantiation mutex:
*
- * down_write(&mm->mmap_sem);
+ * down_write(&mm->mmap_sem);
* or
- * down_read(&mm->mmap_sem);
- * mutex_lock(&hugetlb_instantiation_mutex);
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
*/
struct file_region {
struct list_head link;
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
if (rg->from > t)
return chg;
- /* We overlap with this area, if it extends futher than
+ /* We overlap with this area, if it extends further than
* us then we must extend ourselves. Account for its
* existing reservation. */
if (rg->to > t) {
@@ -475,7 +475,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
- goto err;;
+ goto err;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
- 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1<< PG_writeback);
+ page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
}
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
@@ -591,7 +592,6 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
-
EXPORT_SYMBOL_GPL(PageHuge);
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -842,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
}
/*
- * Increase the hugetlb pool such that it can accomodate a reservation
+ * Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, int delta)
@@ -890,7 +890,7 @@ retry:
/*
* The surplus_list now contains _at_least_ the number of extra pages
- * needed to accomodate the reservation. Add the appropriate number
+ * needed to accommodate the reservation. Add the appropriate number
* of pages to the hugetlb pool and free the extras back to the buddy
* allocator. Commit the entire reservation here to prevent another
* process from stealing the pages as they are added to the pool but
@@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(chg);
+ return ERR_PTR(-VM_FAULT_OOM);
if (chg)
if (hugetlb_get_quota(inode->i_mapping, chg))
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1105,12 +1105,28 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
- struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ struct page *page;
+
+#ifdef CONFIG_HIGHMEM
+ page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
+#else
+ page = virt_to_page(m);
+#endif
__ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
prep_new_huge_page(h, page, page_to_nid(page));
+ /*
+ * If we had gigantic hugepages allocated at boot time, we need
+ * to restore the 'stolen' pages to totalram_pages in order to
+		 * fix confusing memory reports from free(1) and other
+ * side-effects, like CommitLimit going negative.
+ */
+ if (h->order > (MAX_ORDER - 1))
+ totalram_pages += 1 << h->order;
}
}
@@ -1872,8 +1888,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
unsigned long tmp;
int ret;
- if (!write)
- tmp = h->max_huge_pages;
+ tmp = h->max_huge_pages;
if (write && h->order >= MAX_ORDER)
return -EINVAL;
@@ -1938,8 +1953,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
unsigned long tmp;
int ret;
- if (!write)
- tmp = h->nr_overcommit_huge_pages;
+ tmp = h->nr_overcommit_huge_pages;
if (write && h->order >= MAX_ORDER)
return -EINVAL;
@@ -2045,7 +2059,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
- * has a reference to the reservation map it cannot dissappear until
+ * has a reference to the reservation map it cannot disappear until
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
@@ -2118,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
pte_t entry;
entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
- if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
- }
}
@@ -2175,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ if (non_swap_entry(swp) && is_migration_entry(swp))
return 1;
- } else
+ else
return 0;
}
@@ -2188,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
return 1;
- } else
+ else
return 0;
}
@@ -2207,7 +2220,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long sz = huge_page_size(h);
/*
- * A page gathering list, protected by per file i_mmap_lock. The
+ * A page gathering list, protected by per file i_mmap_mutex. The
* lock is used to avoid list corruption from multiple unmapping
* of the same page since we are using page->lru.
*/
@@ -2276,9 +2289,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
__unmap_hugepage_range(vma, start, end, ref_page);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
/*
@@ -2310,7 +2323,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
@@ -2328,7 +2341,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
address, address + huge_page_size(h),
page);
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return 1;
}
@@ -2492,7 +2505,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
- * COW. Warn that such a situation has occured as it may not be obvious
+ * COW. Warn that such a situation has occurred as it may not be obvious
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
printk(KERN_WARNING
@@ -2553,7 +2566,7 @@ retry:
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON |
+ ret = VM_FAULT_HWPOISON |
VM_FAULT_SET_HINDEX(h - hstates);
goto backout_unlocked;
}
@@ -2621,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
migration_entry_wait(mm, (pmd_t *)ptep, address);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
+ return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(h - hstates);
}
@@ -2812,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
- spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
spin_lock(&mm->page_table_lock);
for (; address < end; address += huge_page_size(h)) {
ptep = huge_pte_offset(mm, address);
@@ -2827,7 +2840,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
- spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
flush_tlb_range(vma, start, end);
}
@@ -2835,7 +2848,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
int hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
- int acctflag)
+ vm_flags_t vm_flags)
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
@@ -2845,7 +2858,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* attempt will be made for VM_NORESERVE to allocate a page
* and filesystem quota without using reserves
*/
- if (acctflag & VM_NORESERVE)
+ if (vm_flags & VM_NORESERVE)
return 0;
/*
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 0948f1072d6..c7fc7fd00e3 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,4 +1,4 @@
-/* Inject a hwpoison memory failure on a arbitary pfn */
+/* Inject a hwpoison memory failure on a arbitrary pfn */
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 1d29cdfe8eb..a56a851908d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/cpumask.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -21,6 +21,5 @@ struct mm_struct init_mm = {
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
- .cpu_vm_mask = CPU_MASK_ALL,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index 69488205723..d071d380fb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -66,6 +66,10 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
+/* mm/util.c */
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent);
+
#ifdef CONFIG_MMU
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
@@ -162,7 +166,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset)
}
/*
- * Iterator over all subpages withing the maximally aligned gigantic
+ * Iterator over all subpages within the maximally aligned gigantic
* page 'base'. Handle any discontiguity in the mem_map.
*/
static inline struct page *mem_map_next(struct page *iter,
@@ -245,11 +249,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas,
- int *nonblocking);
-
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 84225f3b719..d6880f542f9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -96,7 +96,7 @@
#include <asm/sections.h>
#include <asm/processor.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
@@ -265,7 +265,7 @@ static void kmemleak_disable(void);
} while (0)
/*
- * Macro invoked when a serious kmemleak condition occured and cannot be
+ * Macro invoked when a serious kmemleak condition occurred and cannot be
* recovered from. Kmemleak will be disabled and further allocation/freeing
* tracing no longer available.
*/
@@ -1006,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object)
/*
* Memory scanning is a long process and it needs to be interruptable. This
- * function checks whether such interrupt condition occured.
+ * function checks whether such interrupt condition occurred.
*/
static int scan_should_stop(void)
{
@@ -1414,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++(*pos);
list_for_each_continue_rcu(n, &object_list) {
- next_obj = list_entry(n, struct kmemleak_object, object_list);
- if (get_object(next_obj))
+ struct kmemleak_object *obj =
+ list_entry(n, struct kmemleak_object, object_list);
+ if (get_object(obj)) {
+ next_obj = obj;
break;
+ }
}
put_object(prev_obj);
@@ -1733,7 +1736,7 @@ static int __init kmemleak_late_init(void)
if (atomic_read(&kmemleak_error)) {
/*
- * Some error occured and kmemleak was disabled. There is a
+ * Some error occurred and kmemleak was disabled. There is a
* small chance that kmemleak_disable() was called immediately
* after setting kmemleak_initialized and we may end up with
* two clean-up threads but serialized by scan_mutex.
diff --git a/mm/ksm.c b/mm/ksm.c
index c2b2a94f9d6..9a68b0cf0a1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -35,6 +35,7 @@
#include <linux/ksm.h>
#include <linux/hash.h>
#include <linux/freezer.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -301,20 +302,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
return rmap_item->address & STABLE_FLAG;
}
-static void hold_anon_vma(struct rmap_item *rmap_item,
- struct anon_vma *anon_vma)
-{
- rmap_item->anon_vma = anon_vma;
- get_anon_vma(anon_vma);
-}
-
-static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
-{
- struct anon_vma *anon_vma = rmap_item->anon_vma;
-
- drop_anon_vma(anon_vma);
-}
-
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
@@ -397,7 +384,7 @@ static void break_cow(struct rmap_item *rmap_item)
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
@@ -466,7 +453,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
ksm_pages_sharing--;
else
ksm_pages_shared--;
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
@@ -554,7 +541,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
else
ksm_pages_shared--;
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -734,7 +721,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
swapped = PageSwapCache(page);
flush_cache_page(vma, addr, page_to_pfn(page));
/*
- * Ok this is tricky, when get_user_pages_fast() run it doesnt
+ * Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
* with the pagecount against the mapcount is racey and
* O_DIRECT can happen right after the check.
@@ -949,7 +936,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
goto out;
/* Must get reference to anon_vma while still holding mmap_sem */
- hold_anon_vma(rmap_item, vma->anon_vma);
+ rmap_item->anon_vma = vma->anon_vma;
+ get_anon_vma(vma->anon_vma);
out:
up_read(&mm->mmap_sem);
return err;
@@ -1314,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
spin_unlock(&ksm_mmlist_lock);
+ /*
+ * Although we tested list_empty() above, a racing __ksm_exit
+ * of the last mm on the list may have removed it since then.
+ */
+ if (slot == &ksm_mm_head)
+ return NULL;
next_mm:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
@@ -1907,9 +1901,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
- current->flags |= PF_OOM_ORIGIN;
+ int oom_score_adj;
+
+ oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = unmerge_and_remove_all_rmap_items();
- current->flags &= ~PF_OOM_ORIGIN;
+ test_set_oom_score_adj(oom_score_adj);
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;
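
The run_store() hunk above replaces the PF_OOM_ORIGIN flag dance with test_set_oom_score_adj(): save the previous score, run the unmerge at maximum OOM priority, then restore it. A hedged sketch of that save/restore pattern (the wrapper below is illustrative):

#include <linux/oom.h>

static int run_as_oom_preferred(int (*work)(void))
{
	int old_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
	int err = work();

	test_set_oom_score_adj(old_adj);	/* restore the caller's score */
	return err;
}
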
diff --git a/mm/maccess.c b/mm/maccess.c
index e2b6f5634e0..4cee182ab5f 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -15,10 +15,10 @@
* happens, handle that and return -EFAULT.
*/
-long __weak probe_kernel_read(void *dst, void *src, size_t size)
+long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));
-long __probe_kernel_read(void *dst, void *src, size_t size)
+long __probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
@@ -43,10 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-long __weak probe_kernel_write(void *dst, void *src, size_t size)
+long __weak probe_kernel_write(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_write")));
-long __probe_kernel_write(void *dst, void *src, size_t size)
+long __probe_kernel_write(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
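
The maccess.c hunk above only constifies the source argument, so existing callers are unaffected; a minimal usage sketch (peek_kernel_long() is illustrative):

#include <linux/uaccess.h>

static int peek_kernel_long(const void *addr, long *out)
{
	/* Returns 0 on success, -EFAULT if the source address faults. */
	return probe_kernel_read(out, addr, sizeof(*out));
}
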
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..74bf193eff0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
endoff = (loff_t)(end - vma->vm_start - 1)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+ /* vmtruncate_range needs to take i_mutex */
up_read(&current->mm->mmap_sem);
error = vmtruncate_range(mapping->host, offset, endoff);
down_read(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index 4618fda975a..ccbf9733959 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
- phys_addr_t base2, phys_addr_t size2)
-{
- if (base2 == base1 + size1)
- return 1;
- else if (base1 == base2 + size2)
- return -1;
-
- return 0;
-}
-
-static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
- unsigned long r1, unsigned long r2)
-{
- phys_addr_t base1 = type->regions[r1].base;
- phys_addr_t size1 = type->regions[r1].size;
- phys_addr_t base2 = type->regions[r2].base;
- phys_addr_t size2 = type->regions[r2].size;
-
- return memblock_addrs_adjacent(base1, size1, base2, size2);
-}
-
long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -206,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->regions[i].size = type->regions[i + 1].size;
}
type->cnt--;
-}
-/* Assumption: base addr of region 1 < base addr of region 2 */
-static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
- unsigned long r1, unsigned long r2)
-{
- type->regions[r1].size += type->regions[r2].size;
- memblock_remove_region(type, r2);
+ /* Special case for empty arrays */
+ if (type->cnt == 0) {
+ type->cnt = 1;
+ type->regions[0].base = 0;
+ type->regions[0].size = 0;
+ }
}
/* Defined below but needed now */
@@ -276,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
return 0;
/* Add the new reserved region now. Should not fail ! */
- BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
+ BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
/* If the array wasn't our static init one, then free it. We only do
* that before SLAB is available as later on, we don't know whether
@@ -296,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1
return 1;
}
-static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock memblock_add_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
- unsigned long coalesced = 0;
- long adjacent, i;
-
- if ((type->cnt == 1) && (type->regions[0].size == 0)) {
- type->regions[0].base = base;
- type->regions[0].size = size;
- return 0;
- }
+ phys_addr_t end = base + size;
+ int i, slot = -1;
- /* First try and coalesce this MEMBLOCK with another. */
+ /* First try and coalesce this MEMBLOCK with others */
for (i = 0; i < type->cnt; i++) {
- phys_addr_t rgnbase = type->regions[i].base;
- phys_addr_t rgnsize = type->regions[i].size;
+ struct memblock_region *rgn = &type->regions[i];
+ phys_addr_t rend = rgn->base + rgn->size;
+
+		/* Exit if there are no possible hits */
+ if (rgn->base > end || rgn->size == 0)
+ break;
- if ((rgnbase == base) && (rgnsize == size))
- /* Already have this region, so we're done */
+ /* Check if we are fully enclosed within an existing
+ * block
+ */
+ if (rgn->base <= base && rend >= end)
return 0;
- adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
- /* Check if arch allows coalescing */
- if (adjacent != 0 && type == &memblock.memory &&
- !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
- break;
- if (adjacent > 0) {
- type->regions[i].base -= size;
- type->regions[i].size += size;
- coalesced++;
- break;
- } else if (adjacent < 0) {
- type->regions[i].size += size;
- coalesced++;
- break;
+ /* Check if we overlap or are adjacent with the bottom
+ * of a block.
+ */
+ if (base < rgn->base && end >= rgn->base) {
+ /* If we can't coalesce, create a new block */
+ if (!memblock_memory_can_coalesce(base, size,
+ rgn->base,
+ rgn->size)) {
+ /* Overlap & can't coalesce are mutually
+				 * exclusive; if you do that, be prepared
+ * for trouble
+ */
+ WARN_ON(end != rgn->base);
+ goto new_block;
+ }
+ /* We extend the bottom of the block down to our
+ * base
+ */
+ rgn->base = base;
+ rgn->size = rend - base;
+
+ /* Return if we have nothing else to allocate
+ * (fully coalesced)
+ */
+ if (rend >= end)
+ return 0;
+
+ /* We continue processing from the end of the
+ * coalesced block.
+ */
+ base = rend;
+ size = end - base;
+ }
+
+ /* Now check if we overlap or are adjacent with the
+ * top of a block
+ */
+ if (base <= rend && end >= rend) {
+ /* If we can't coalesce, create a new block */
+ if (!memblock_memory_can_coalesce(rgn->base,
+ rgn->size,
+ base, size)) {
+ /* Overlap & can't coalesce are mutually
+				 * exclusive; if you do that, be prepared
+ * for trouble
+ */
+ WARN_ON(rend != base);
+ goto new_block;
+ }
+ /* We adjust our base down to enclose the
+ * original block and destroy it. It will be
+ * part of our new allocation. Since we've
+ * freed an entry, we know we won't fail
+ * to allocate one later, so we won't risk
+ * losing the original block allocation.
+ */
+ size += (base - rgn->base);
+ base = rgn->base;
+ memblock_remove_region(type, i--);
}
}
- /* If we plugged a hole, we may want to also coalesce with the
- * next region
+ /* If the array is empty, special case, replace the fake
+ * filler region and return
*/
- if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
- ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
- type->regions[i].size,
- type->regions[i+1].base,
- type->regions[i+1].size)))) {
- memblock_coalesce_regions(type, i, i+1);
- coalesced++;
+ if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ return 0;
}
- if (coalesced)
- return coalesced;
-
+ new_block:
/* If we are out of space, we fail. It's too late to resize the array
* but then this shouldn't have happened in the first place.
*/
@@ -362,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
} else {
type->regions[i+1].base = base;
type->regions[i+1].size = size;
+ slot = i + 1;
break;
}
}
-
if (base < type->regions[0].base) {
type->regions[0].base = base;
type->regions[0].size = size;
+ slot = 0;
}
type->cnt++;
@@ -376,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
* our allocation and return an error
*/
if (type->cnt == type->max && memblock_double_array(type)) {
- type->cnt--;
+ BUG_ON(slot < 0);
+ memblock_remove_region(type, slot);
return -1;
}
@@ -389,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
}
-static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock __memblock_remove(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
- phys_addr_t rgnbegin, rgnend;
phys_addr_t end = base + size;
int i;
- rgnbegin = rgnend = 0; /* supress gcc warnings */
-
- /* Find the region where (base, size) belongs to */
- for (i=0; i < type->cnt; i++) {
- rgnbegin = type->regions[i].base;
- rgnend = rgnbegin + type->regions[i].size;
+ /* Walk through the array for collisions */
+ for (i = 0; i < type->cnt; i++) {
+ struct memblock_region *rgn = &type->regions[i];
+ phys_addr_t rend = rgn->base + rgn->size;
- if ((rgnbegin <= base) && (end <= rgnend))
+ /* Nothing more to do, exit */
+ if (rgn->base > end || rgn->size == 0)
break;
- }
- /* Didn't find the region */
- if (i == type->cnt)
- return -1;
+ /* If we fully enclose the block, drop it */
+ if (base <= rgn->base && end >= rend) {
+ memblock_remove_region(type, i--);
+ continue;
+ }
- /* Check to see if we are removing entire region */
- if ((rgnbegin == base) && (rgnend == end)) {
- memblock_remove_region(type, i);
- return 0;
- }
+ /* If we are fully enclosed within a block
+ * then we need to split it and we are done
+ */
+ if (base > rgn->base && end < rend) {
+ rgn->size = base - rgn->base;
+ if (!memblock_add_region(type, end, rend - end))
+ return 0;
+ /* Failure to split is bad, we at least
+ * restore the block before erroring
+ */
+ rgn->size = rend - rgn->base;
+ WARN_ON(1);
+ return -1;
+ }
- /* Check to see if region is matching at the front */
- if (rgnbegin == base) {
- type->regions[i].base = end;
- type->regions[i].size -= size;
- return 0;
- }
+ /* Check if we need to trim the bottom of a block */
+ if (rgn->base < end && rend > end) {
+ rgn->size -= end - rgn->base;
+ rgn->base = end;
+ break;
+ }
- /* Check to see if the region is matching at the end */
- if (rgnend == end) {
- type->regions[i].size -= size;
- return 0;
- }
+ /* And check if we need to trim the top of a block */
+ if (base < rend)
+ rgn->size -= rend - base;
- /*
- * We need to split the entry - adjust the current one to the
- * beginging of the hole and add the region after hole.
- */
- type->regions[i].size = base - type->regions[i].base;
- return memblock_add_region(type, end, rgnend - end);
+ }
+ return 0;
}
long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
@@ -467,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph
found = memblock_find_base(size, align, 0, max_addr);
if (found != MEMBLOCK_ERROR &&
- memblock_add_region(&memblock.reserved, found, size) >= 0)
+ !memblock_add_region(&memblock.reserved, found, size))
return found;
return 0;
@@ -548,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
if (this_nid == nid) {
phys_addr_t ret = memblock_find_region(start, this_end, size, align);
if (ret != MEMBLOCK_ERROR &&
- memblock_add_region(&memblock.reserved, ret, size) >= 0)
+ !memblock_add_region(&memblock.reserved, ret, size))
return ret;
}
start = this_end;
@@ -735,9 +758,9 @@ void __init memblock_analyze(void)
/* Check marker in the unused last array entry */
WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
- != (phys_addr_t)RED_INACTIVE);
+ != MEMBLOCK_INACTIVE);
WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
- != (phys_addr_t)RED_INACTIVE);
+ != MEMBLOCK_INACTIVE);
memblock.memory_size = 0;
@@ -763,8 +786,8 @@ void __init memblock_init(void)
memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
/* Write a marker in the unused last array entry */
- memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
- memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+ memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
+ memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
/* Create a dummy zero size MEMBLOCK which will get coalesced away later.
* This simplifies the memblock_add() code below...
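
The rewritten memblock_add_region() and __memblock_remove() above make the overlap semantics explicit: overlapping or adjacent adds are coalesced into one region, and removing a span from the middle of a region splits it in two. A hedged early-boot sketch using the public wrappers (the addresses are made up for illustration):

#include <linux/init.h>
#include <linux/memblock.h>

static void __init memblock_overlap_demo(void)
{
	memblock_add(0x1000000, 0x1000000);	/* [16MB, 32MB)               */
	memblock_add(0x1800000, 0x1000000);	/* overlaps: coalesced into   */
						/* a single [16MB, 40MB)      */
	memblock_remove(0x1c00000, 0x200000);	/* splits it into two regions */
}
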
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index da53a252b25..930de943727 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0;
#define do_swap_account (0)
#endif
-/*
- * Per memcg event counter is incremented at every pagein/pageout. This counter
- * is used for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- *
- * These values will be used as !((event) & ((1 <<(thresh)) - 1))
- */
-#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
-#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
/*
* Statistics for memory cgroup.
@@ -93,19 +84,40 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
- MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
- /* incremented at every pagein/pageout */
- MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
-
MEM_CGROUP_STAT_NSTATS,
};
+enum mem_cgroup_events_index {
+ MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
+ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
+ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
+ MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
+ MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
+ MEM_CGROUP_EVENTS_NSTATS,
+};
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used to
+ * trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg events.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMAINFO,
+ MEM_CGROUP_NTARGETS,
+};
+#define THRESHOLDS_EVENTS_TARGET (128)
+#define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
+
struct mem_cgroup_stat_cpu {
- s64 count[MEM_CGROUP_STAT_NSTATS];
+ long count[MEM_CGROUP_STAT_NSTATS];
+ unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long targets[MEM_CGROUP_NTARGETS];
};
/*
@@ -192,6 +204,50 @@ struct mem_cgroup_eventfd_list {
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
+enum {
+ SCAN_BY_LIMIT,
+ SCAN_BY_SYSTEM,
+ NR_SCAN_CONTEXT,
+ SCAN_BY_SHRINK, /* not recorded now */
+};
+
+enum {
+ SCAN,
+ SCAN_ANON,
+ SCAN_FILE,
+ ROTATE,
+ ROTATE_ANON,
+ ROTATE_FILE,
+ FREED,
+ FREED_ANON,
+ FREED_FILE,
+ ELAPSED,
+ NR_SCANSTATS,
+};
+
+struct scanstat {
+ spinlock_t lock;
+ unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+ unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+};
+
+const char *scanstat_string[NR_SCANSTATS] = {
+ "scanned_pages",
+ "scanned_anon_pages",
+ "scanned_file_pages",
+ "rotated_pages",
+ "rotated_anon_pages",
+ "rotated_file_pages",
+ "freed_pages",
+ "freed_anon_pages",
+ "freed_file_pages",
+ "elapsed_ns",
+};
+#define SCANSTAT_WORD_LIMIT "_by_limit"
+#define SCANSTAT_WORD_SYSTEM "_by_system"
+#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
+
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
@@ -218,25 +274,28 @@ struct mem_cgroup {
* per zone LRU lists.
*/
struct mem_cgroup_lru_info info;
-
- /*
- protect against reclaim related member.
- */
- spinlock_t reclaim_param_lock;
-
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
*/
int last_scanned_child;
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
+#endif
/*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- atomic_t oom_lock;
+
+ bool oom_lock;
+ atomic_t under_oom;
+
atomic_t refcnt;
- unsigned int swappiness;
+ int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
@@ -254,7 +313,8 @@ struct mem_cgroup {
/* For oom notifier event fd */
struct list_head oom_notify;
-
+ /* For recording LRU-scan statistics */
+ struct scanstat scanstat;
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
@@ -327,13 +387,6 @@ enum charge_type {
NR_CHARGE_TYPE,
};
-/* only for here (for easy reading.) */
-#define PCGF_CACHE (1UL << PCG_CACHE)
-#define PCGF_USED (1UL << PCG_USED)
-#define PCGF_LOCK (1UL << PCG_LOCK)
-/* Not used, but added here for completeness */
-#define PCGF_ACCT (1UL << PCG_ACCT)
-
/* for encoding cft->private value on file */
#define _MEM (0)
#define _MEMSWAP (1)
@@ -357,7 +410,7 @@ enum charge_type {
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -371,14 +424,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
}
static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
+page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
{
- struct mem_cgroup *mem = pc->mem_cgroup;
- int nid = page_cgroup_nid(pc);
- int zid = page_cgroup_zid(pc);
-
- if (!mem)
- return NULL;
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
return mem_cgroup_zoneinfo(mem, nid, zid);
}
@@ -504,11 +553,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
}
}
-static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
-{
- return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
-}
-
static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
@@ -565,11 +609,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
* common workload, threashold and synchonization as vmstat[] should be
* implemented.
*/
-static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx)
+static long mem_cgroup_read_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
+ long val = 0;
int cpu;
- s64 val = 0;
get_online_cpus();
for_each_online_cpu(cpu)
@@ -583,15 +627,6 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
return val;
}
-static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
- s64 ret;
-
- ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
- ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
- return ret;
-}
-
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
bool charge)
{
@@ -599,6 +634,32 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}
+void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
+{
+ this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
+}
+
+void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
+{
+ this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
+}
+
+static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
+ enum mem_cgroup_events_index idx)
+{
+ unsigned long val = 0;
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ val += per_cpu(mem->stat->events[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ spin_lock(&mem->pcp_counter_lock);
+ val += mem->nocpu_base.events[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+#endif
+ return val;
+}
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
bool file, int nr_pages)
{
@@ -611,39 +672,89 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
+ __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
else {
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+ __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
+ __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
preempt_enable();
}
-static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
- enum lru_list idx)
+unsigned long
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+ unsigned int lru_mask)
{
- int nid, zid;
struct mem_cgroup_per_zone *mz;
+ enum lru_list l;
+ unsigned long ret = 0;
+
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+
+ for_each_lru(l) {
+ if (BIT(l) & lru_mask)
+ ret += MEM_CGROUP_ZSTAT(mz, l);
+ }
+ return ret;
+}
+
+static unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+ int nid, unsigned int lru_mask)
+{
u64 total = 0;
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
- for_each_online_node(nid)
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- total += MEM_CGROUP_ZSTAT(mz, idx);
- }
return total;
}
-static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+ unsigned int lru_mask)
{
- s64 val;
+ int nid;
+ u64 total = 0;
+
+ for_each_node_state(nid, N_HIGH_MEMORY)
+ total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
+ return total;
+}
+
+static bool __memcg_event_check(struct mem_cgroup *mem, int target)
+{
+ unsigned long val, next;
+
+ val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+ next = this_cpu_read(mem->stat->targets[target]);
+ /* from time_after() in jiffies.h */
+ return ((long)next - (long)val < 0);
+}
+
+static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
+{
+ unsigned long val, next;
+
+ val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
- val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_NUMAINFO:
+ next = val + NUMAINFO_EVENTS_TARGET;
+ break;
+ default:
+ return;
+ }
- return !(val & ((1 << event_mask_shift) - 1));
+ this_cpu_write(mem->stat->targets[target], next);
}
/*
@@ -653,10 +764,23 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
/* threshold event is triggered in finer grain than soft limit */
- if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
+ if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
mem_cgroup_threshold(mem);
- if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
+ __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
+ if (unlikely(__memcg_event_check(mem,
+ MEM_CGROUP_TARGET_SOFTLIMIT))) {
mem_cgroup_update_tree(mem, page);
+ __mem_cgroup_target_update(mem,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+ }
+#if MAX_NUMNODES > 1
+ if (unlikely(__memcg_event_check(mem,
+ MEM_CGROUP_TARGET_NUMAINFO))) {
+ atomic_inc(&mem->numainfo_events);
+ __mem_cgroup_target_update(mem,
+ MEM_CGROUP_TARGET_NUMAINFO);
+ }
+#endif
}
}
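
The two hunks above replace the old power-of-two mask test with one per-consumer event target: a single per-cpu event counter keeps counting, and each consumer (thresholds, soft limit, NUMA info) carries its own "next" value that is pushed forward by its period once it fires. Below is a minimal userspace sketch of that pattern, not taken from the patch; the period values and names are invented stand-ins for THRESHOLDS_EVENTS_TARGET and SOFTLIMIT_EVENTS_TARGET.

#include <stdbool.h>
#include <stdio.h>

enum target { TARGET_THRESH, TARGET_SOFTLIMIT, NR_TARGETS };

static const unsigned long period[NR_TARGETS] = {
        [TARGET_THRESH]    = 128,       /* fires often  */
        [TARGET_SOFTLIMIT] = 1024,      /* fires rarely */
};

static unsigned long events;                    /* pages charged/uncharged */
static unsigned long next_fire[NR_TARGETS];     /* per-consumer target */

static bool event_check(enum target t)
{
        /* same signed-difference trick as time_after() in the patch */
        return (long)next_fire[t] - (long)events < 0;
}

static void target_update(enum target t)
{
        next_fire[t] = events + period[t];
}

int main(void)
{
        int i;

        for (i = 0; i < 5000; i++) {
                events++;                       /* one page event */
                if (!event_check(TARGET_THRESH))
                        continue;
                printf("threshold work at %lu\n", events);
                target_update(TARGET_THRESH);
                if (event_check(TARGET_SOFTLIMIT)) {
                        printf("soft-limit work at %lu\n", events);
                        target_update(TARGET_SOFTLIMIT);
                }
        }
        return 0;
}
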
@@ -681,7 +805,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup, css);
}
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *mem = NULL;
@@ -785,6 +909,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
return (mem == root_mem_cgroup);
}
+void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
+{
+ struct mem_cgroup *mem;
+
+ if (!mm)
+ return;
+
+ rcu_read_lock();
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem))
+ goto out;
+
+ switch (idx) {
+ case PGMAJFAULT:
+ mem_cgroup_pgmajfault(mem, 1);
+ break;
+ case PGFAULT:
+ mem_cgroup_pgfault(mem, 1);
+ break;
+ default:
+ BUG();
+ }
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(mem_cgroup_count_vm_event);
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
@@ -815,7 +966,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
* We don't check PCG_USED bit. It's cleared when the "page" is finally
* removed from global LRU.
*/
- mz = page_cgroup_zoneinfo(pc);
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
/* huge page split is done under lru_lock. so, we have no races. */
MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
if (mem_cgroup_is_root(pc->mem_cgroup))
@@ -829,6 +980,32 @@ void mem_cgroup_del_lru(struct page *page)
mem_cgroup_del_lru_list(page, page_lru(page));
}
+/*
+ * Writeback is about to end against a page which has been marked for immediate
+ * reclaim. If it still appears to be reclaimable, move it to the tail of the
+ * inactive list.
+ */
+void mem_cgroup_rotate_reclaimable_page(struct page *page)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct page_cgroup *pc;
+ enum lru_list lru = page_lru(page);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pc = lookup_page_cgroup(page);
+ /* unused or root page is not rotated. */
+ if (!PageCgroupUsed(pc))
+ return;
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
+ if (mem_cgroup_is_root(pc->mem_cgroup))
+ return;
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
+ list_move_tail(&pc->lru, &mz->lists[lru]);
+}
+
void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
@@ -845,7 +1022,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
smp_rmb();
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
- mz = page_cgroup_zoneinfo(pc);
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
list_move(&pc->lru, &mz->lists[lru]);
}
@@ -862,7 +1039,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
return;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
- mz = page_cgroup_zoneinfo(pc);
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
/* huge page split is done under lru_lock. so, we have no races. */
MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
@@ -872,18 +1049,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
}
/*
- * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
- * lru because the page may.be reused after it's fully uncharged (because of
- * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
- * it again. This function is only used to charge SwapCache. It's done under
- * lock_page and expected that zone->lru_lock is never held.
+ * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
+ * while it's linked to lru because the page may be reused after it's fully
+ * uncharged. To handle that, unlink page_cgroup from LRU when charging it again.
+ * It's done under lock_page and expected that zone->lru_lock is never held.
*/
-static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
+static void mem_cgroup_lru_del_before_commit(struct page *page)
{
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
+ /*
+ * Doing this check without taking ->lru_lock seems wrong, but it is
+ * safe: if page_cgroup's USED bit is unset, the page will not be added
+ * to any memcg's LRU; if page_cgroup's USED bit is set, the commit
+ * after this will fail anyway.
+ * All of this charge/uncharge is done under mutual exclusion, so we
+ * don't need to take care of changes in the USED bit.
+ */
+ if (likely(!PageLRU(page)))
+ return;
+
spin_lock_irqsave(&zone->lru_lock, flags);
/*
* Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -894,12 +1081,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
-static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
+static void mem_cgroup_lru_add_after_commit(struct page *page)
{
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
+ /* take care in case the page is added to LRU while we commit it */
+ if (likely(!PageLRU(page)))
+ return;
spin_lock_irqsave(&zone->lru_lock, flags);
/* link when the page is linked to LRU but page_cgroup isn't */
if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -917,6 +1107,21 @@ void mem_cgroup_move_lists(struct page *page,
mem_cgroup_add_lru_list(page, to);
}
+/*
+ * Checks whether given mem is same or in the root_mem's
+ * hierarchy subtree
+ */
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
+ struct mem_cgroup *mem)
+{
+ if (root_mem != mem) {
+ return (root_mem->use_hierarchy &&
+ css_is_ancestor(&mem->css, &root_mem->css));
+ }
+
+ return true;
+}
+
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
@@ -936,10 +1141,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
* enabled in "curr" and "curr" is a child of "mem" in *cgroup*
* hierarchy(even if use_hierarchy is disabled in "mem").
*/
- if (mem->use_hierarchy)
- ret = css_is_ancestor(&curr->css, &mem->css);
- else
- ret = (curr == mem);
+ ret = mem_cgroup_same_or_subtree(mem, curr);
css_put(&curr->css);
return ret;
}
@@ -951,8 +1153,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
unsigned long gb;
unsigned long inactive_ratio;
- inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
+ inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
+ active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -991,23 +1193,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
unsigned long active;
unsigned long inactive;
- inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
- active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
+ inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
+ active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
return (active > inactive);
}
-unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
- struct zone *zone,
- enum lru_list lru)
-{
- int nid = zone_to_nid(zone);
- int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
- return MEM_CGROUP_ZSTAT(mz, lru);
-}
-
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
@@ -1032,10 +1223,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
return NULL;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
- mz = page_cgroup_zoneinfo(pc);
- if (!mz)
- return NULL;
-
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
return &mz->reclaim_stat;
}
@@ -1067,9 +1255,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
if (scan >= nr_to_scan)
break;
- page = pc->page;
if (unlikely(!PageCgroupUsed(pc)))
continue;
+
+ page = lookup_cgroup_page(pc);
+
if (unlikely(!PageLRU(page)))
continue;
@@ -1101,49 +1291,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
-static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
-{
- if (do_swap_account) {
- if (res_counter_check_under_limit(&mem->res) &&
- res_counter_check_under_limit(&mem->memsw))
- return true;
- } else
- if (res_counter_check_under_limit(&mem->res))
- return true;
- return false;
-}
-
/**
- * mem_cgroup_check_margin - check if the memory cgroup allows charging
- * @mem: memory cgroup to check
- * @bytes: the number of bytes the caller intends to charge
+ * mem_cgroup_margin - calculate chargeable space of a memory cgroup
+ * @mem: the memory cgroup
*
- * Returns a boolean value on whether @mem can be charged @bytes or
- * whether this would exceed the limit.
+ * Returns the maximum amount of memory @mem can be charged with, in
+ * pages.
*/
-static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
{
- if (!res_counter_check_margin(&mem->res, bytes))
- return false;
- if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
- return false;
- return true;
+ unsigned long long margin;
+
+ margin = res_counter_margin(&mem->res);
+ if (do_swap_account)
+ margin = min(margin, res_counter_margin(&mem->memsw));
+ return margin >> PAGE_SHIFT;
}
-static unsigned int get_swappiness(struct mem_cgroup *memcg)
+int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
- unsigned int swappiness;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;
- spin_lock(&memcg->reclaim_param_lock);
- swappiness = memcg->swappiness;
- spin_unlock(&memcg->reclaim_param_lock);
-
- return swappiness;
+ return memcg->swappiness;
}
static void mem_cgroup_start_move(struct mem_cgroup *mem)
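
mem_cgroup_margin(), which replaces the mem_cgroup_check_under_limit()/mem_cgroup_check_margin() pair above, reports the headroom of the res and memsw counters as a single page count. The following is a small stand-alone sketch of that calculation; the res_counter is modeled as a plain limit/usage pair and the values are arbitrary.

#include <stdio.h>

#define PAGE_SHIFT 12

struct counter {                        /* res_counter stand-in */
        unsigned long long limit;
        unsigned long long usage;
};

static unsigned long long counter_margin(const struct counter *c)
{
        return c->limit > c->usage ? c->limit - c->usage : 0;
}

static unsigned long long margin_pages(const struct counter *res,
                                       const struct counter *memsw,
                                       int do_swap_account)
{
        unsigned long long margin = counter_margin(res);

        if (do_swap_account && counter_margin(memsw) < margin)
                margin = counter_margin(memsw);
        return margin >> PAGE_SHIFT;    /* bytes -> pages */
}

int main(void)
{
        struct counter res   = { .limit = 1ULL << 20, .usage = 768ULL << 10 };
        struct counter memsw = { .limit = 2ULL << 20, .usage = 768ULL << 10 };

        /* 256 KiB of headroom on "res" wins: 64 pages chargeable */
        printf("%llu pages chargeable\n", margin_pages(&res, &memsw, 1));
        return 0;
}
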
@@ -1207,10 +1380,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
to = mc.to;
if (!from)
goto unlock;
- if (from == mem || to == mem
- || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
- || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
- ret = true;
+
+ ret = mem_cgroup_same_or_subtree(mem, from)
+ || mem_cgroup_same_or_subtree(mem, to);
unlock:
spin_unlock(&mc.lock);
return ret;
@@ -1359,18 +1531,191 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
rcu_read_unlock();
/* Updates scanning parameter */
- spin_lock(&root_mem->reclaim_param_lock);
if (!css) {
/* this means start scan from ID:1 */
root_mem->last_scanned_child = 0;
} else
root_mem->last_scanned_child = found;
- spin_unlock(&root_mem->reclaim_param_lock);
}
return ret;
}
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap: specify true here if the user wants file-only information.
+ *
+ * This function checks whether the specified memcg contains any
+ * reclaimable pages on the given node and returns true if it does.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+ int nid, bool noswap)
+{
+ if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
+ return true;
+ if (noswap || !total_swap_pages)
+ return false;
+ if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
+ return true;
+ return false;
+
+}
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+ int nid;
+ /*
+ * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+ * pagein/pageout changes since the last update.
+ */
+ if (!atomic_read(&mem->numainfo_events))
+ return;
+ if (atomic_inc_return(&mem->numainfo_updating) > 1)
+ return;
+
+ /* make a nodemask where this memcg uses memory from */
+ mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+ for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+ if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+ node_clear(nid, mem->scan_nodes);
+ }
+
+ atomic_set(&mem->numainfo_events, 0);
+ atomic_set(&mem->numainfo_updating, 0);
+}
+
+/*
+ * Select a node to start reclaim from. Because all we need is to reduce
+ * the usage counter, starting from anywhere is OK. Reclaiming from the
+ * current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node
+ * which we'll use or have used, so it may hurt LRU ordering; and if
+ * several threads hit their limits, they will contend on that node. But
+ * freeing from a remote node costs more because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ int node;
+
+ mem_cgroup_may_update_nodemask(mem);
+ node = mem->last_scanned_node;
+
+ node = next_node(node, mem->scan_nodes);
+ if (node == MAX_NUMNODES)
+ node = first_node(mem->scan_nodes);
+ /*
+ * We call this when we hit limit, not when pages are added to LRU.
+ * The LRUs may hold no pages because all pages are UNEVICTABLE, or the
+ * memcg is too small and no pages are on the LRU. In that case, we use
+ * the current node.
+ */
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+
+ mem->last_scanned_node = node;
+ return node;
+}
+
+/*
+ * Check all nodes to see whether any contains reclaimable pages.
+ * For a quick scan, we make use of scan_nodes, which lets us skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information, so we need to double-check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+ int nid;
+
+ /*
+ * Quick check, making use of scan_nodes:
+ * we can skip unused nodes.
+ */
+ if (!nodes_empty(mem->scan_nodes)) {
+ for (nid = first_node(mem->scan_nodes);
+ nid < MAX_NUMNODES;
+ nid = next_node(nid, mem->scan_nodes)) {
+
+ if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ return true;
+ }
+ }
+ /*
+ * Check rest of nodes.
+ */
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ if (node_isset(nid, mem->scan_nodes))
+ continue;
+ if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ return true;
+ }
+ return false;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+ return 0;
+}
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+ return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
+#endif
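
The victim-node selection added above is a plain round-robin walk over a lazily refreshed nodemask, falling back to the local node when the mask turns out to be empty. Here is a self-contained sketch of the same walk; an ordinary bitmask stands in for nodemask_t and the node numbers are made up.

#include <stdio.h>

#define MAX_NODES 8

static int next_set_bit(unsigned int mask, int after)
{
        int n;

        for (n = after + 1; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        return n;
        return MAX_NODES;               /* no node past 'after' */
}

static int select_victim_node(unsigned int scan_nodes, int *last, int local)
{
        int node = next_set_bit(scan_nodes, *last);

        if (node == MAX_NODES)          /* wrap around */
                node = next_set_bit(scan_nodes, -1);
        if (node == MAX_NODES)          /* empty mask: fall back to local */
                node = local;
        *last = node;
        return node;
}

int main(void)
{
        unsigned int scan_nodes = (1u << 1) | (1u << 3) | (1u << 6);
        int last = MAX_NODES;           /* "never scanned" start value */
        int i;

        for (i = 0; i < 6; i++)         /* prints 1 3 6 1 3 6 */
                printf("reclaim from node %d\n",
                       select_victim_node(scan_nodes, &last, 0));
        return 0;
}
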
+
+static void __mem_cgroup_record_scanstat(unsigned long *stats,
+ struct memcg_scanrecord *rec)
+{
+
+ stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
+ stats[SCAN_ANON] += rec->nr_scanned[0];
+ stats[SCAN_FILE] += rec->nr_scanned[1];
+
+ stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
+ stats[ROTATE_ANON] += rec->nr_rotated[0];
+ stats[ROTATE_FILE] += rec->nr_rotated[1];
+
+ stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
+ stats[FREED_ANON] += rec->nr_freed[0];
+ stats[FREED_FILE] += rec->nr_freed[1];
+
+ stats[ELAPSED] += rec->elapsed;
+}
+
+static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
+{
+ struct mem_cgroup *mem;
+ int context = rec->context;
+
+ if (context >= NR_SCAN_CONTEXT)
+ return;
+
+ mem = rec->mem;
+ spin_lock(&mem->scanstat.lock);
+ __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
+ spin_unlock(&mem->scanstat.lock);
+
+ mem = rec->root;
+ spin_lock(&mem->scanstat.lock);
+ __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
+ spin_unlock(&mem->scanstat.lock);
+}
+
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1386,7 +1731,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
struct zone *zone,
gfp_t gfp_mask,
- unsigned long reclaim_options)
+ unsigned long reclaim_options,
+ unsigned long *total_scanned)
{
struct mem_cgroup *victim;
int ret, total = 0;
@@ -1394,18 +1740,37 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
- unsigned long excess = mem_cgroup_get_excess(root_mem);
+ struct memcg_scanrecord rec;
+ unsigned long excess;
+ unsigned long scanned;
+
+ excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
- if (root_mem->memsw_is_minimum)
+ if (!check_soft && !shrink && root_mem->memsw_is_minimum)
noswap = true;
+ if (shrink)
+ rec.context = SCAN_BY_SHRINK;
+ else if (check_soft)
+ rec.context = SCAN_BY_SYSTEM;
+ else
+ rec.context = SCAN_BY_LIMIT;
+
+ rec.root = root_mem;
+
while (1) {
victim = mem_cgroup_select_victim(root_mem);
if (victim == root_mem) {
loop++;
- if (loop >= 1)
- drain_all_stock_async();
+ /*
+ * We are not draining per cpu cached charges during
+ * soft limit reclaim because global reclaim doesn't
+ * care about charges. It tries to free some memory, and
+ * freeing cached charges would not give it any.
+ */
+ if (!check_soft && loop >= 1)
+ drain_all_stock_async(root_mem);
if (loop >= 2) {
/*
* If we have not been able to reclaim
@@ -1417,7 +1782,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
break;
}
/*
- * We want to do more targetted reclaim.
+ * We want to do more targeted reclaim.
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
@@ -1429,18 +1794,28 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
}
}
}
- if (!mem_cgroup_local_usage(victim)) {
+ if (!mem_cgroup_reclaimable(victim, noswap)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
continue;
}
+ rec.mem = victim;
+ rec.nr_scanned[0] = 0;
+ rec.nr_scanned[1] = 0;
+ rec.nr_rotated[0] = 0;
+ rec.nr_rotated[1] = 0;
+ rec.nr_freed[0] = 0;
+ rec.nr_freed[1] = 0;
+ rec.elapsed = 0;
/* we use swappiness of local cgroup */
- if (check_soft)
+ if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, get_swappiness(victim), zone);
- else
+ noswap, zone, &rec, &scanned);
+ *total_scanned += scanned;
+ } else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
- noswap, get_swappiness(victim));
+ noswap, &rec);
+ mem_cgroup_record_scanstat(&rec);
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
@@ -1451,10 +1826,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
return ret;
total += ret;
if (check_soft) {
- if (res_counter_check_under_soft_limit(&root_mem->res))
+ if (!res_counter_soft_limit_excess(&root_mem->res))
return total;
- } else if (mem_cgroup_check_under_limit(root_mem))
- return 1 + total;
+ } else if (mem_cgroup_margin(root_mem))
+ return total;
}
return total;
}
@@ -1462,38 +1837,84 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
+ * Has to be called with memcg_oom_lock
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
- int x, lock_count = 0;
- struct mem_cgroup *iter;
+ int lock_count = -1;
+ struct mem_cgroup *iter, *failed = NULL;
+ bool cond = true;
- for_each_mem_cgroup_tree(iter, mem) {
- x = atomic_inc_return(&iter->oom_lock);
- lock_count = max(x, lock_count);
+ for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ bool locked = iter->oom_lock;
+
+ iter->oom_lock = true;
+ if (lock_count == -1)
+ lock_count = iter->oom_lock;
+ else if (lock_count != locked) {
+ /*
+ * this subtree of our hierarchy is already locked
+ * so we cannot give a lock.
+ */
+ lock_count = 0;
+ failed = iter;
+ cond = false;
+ }
}
- if (lock_count == 1)
- return true;
- return false;
+ if (!failed)
+ goto done;
+
+ /*
+ * OK, we failed to lock the whole subtree so we have to clean up
+ * what we set up to the failing subtree
+ */
+ cond = true;
+ for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ if (iter == failed) {
+ cond = false;
+ continue;
+ }
+ iter->oom_lock = false;
+ }
+done:
+ return lock_count;
}
+/*
+ * Has to be called with memcg_oom_lock
+ */
static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
struct mem_cgroup *iter;
+ for_each_mem_cgroup_tree(iter, mem)
+ iter->oom_lock = false;
+ return 0;
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ atomic_inc(&iter->under_oom);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *iter;
+
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, mem)
- atomic_add_unless(&iter->oom_lock, -1, 0);
- return 0;
+ atomic_add_unless(&iter->under_oom, -1, 0);
}
-
-static DEFINE_MUTEX(memcg_oom_mutex);
+static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
@@ -1504,25 +1925,20 @@ struct oom_wait_info {
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
- struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
+ struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
+ *oom_wait_mem;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+ oom_wait_mem = oom_wait_info->mem;
- if (oom_wait_info->mem == wake_mem)
- goto wakeup;
- /* if no hierarchy, no match */
- if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
- return 0;
/*
* Both of oom_wait_info->mem and wake_mem are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
- if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
- !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
+ if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
+ && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
return 0;
-
-wakeup:
return autoremove_wake_function(wait, mode, sync, arg);
}
@@ -1534,7 +1950,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
static void memcg_oom_recover(struct mem_cgroup *mem)
{
- if (mem && atomic_read(&mem->oom_lock))
+ if (mem && atomic_read(&mem->under_oom))
memcg_wakeup_oom(mem);
}
@@ -1552,8 +1968,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
need_to_kill = true;
+ mem_cgroup_mark_under_oom(mem);
+
/* At first, try to OOM lock hierarchy under mem.*/
- mutex_lock(&memcg_oom_mutex);
+ spin_lock(&memcg_oom_lock);
locked = mem_cgroup_oom_lock(mem);
/*
* Even if signal_pending(), we can't quit charge() loop without
@@ -1565,7 +1983,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
need_to_kill = false;
if (locked)
mem_cgroup_oom_notify(mem);
- mutex_unlock(&memcg_oom_mutex);
+ spin_unlock(&memcg_oom_lock);
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1574,10 +1992,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
}
- mutex_lock(&memcg_oom_mutex);
- mem_cgroup_oom_unlock(mem);
+ spin_lock(&memcg_oom_lock);
+ if (locked)
+ mem_cgroup_oom_unlock(mem);
memcg_wakeup_oom(mem);
- mutex_unlock(&memcg_oom_mutex);
+ spin_unlock(&memcg_oom_lock);
+
+ mem_cgroup_unmark_under_oom(mem);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
@@ -1661,17 +2082,19 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat);
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
-#define CHARGE_SIZE (32 * PAGE_SIZE)
+#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
- int charge;
+ unsigned int nr_pages;
struct work_struct work;
+ unsigned long flags;
+#define FLUSHING_CACHED_CHARGE (0)
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
/*
- * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
+ * Try to consume stocked charge on this cpu. If success, one page is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
@@ -1682,8 +2105,8 @@ static bool consume_stock(struct mem_cgroup *mem)
bool ret = true;
stock = &get_cpu_var(memcg_stock);
- if (mem == stock->cached && stock->charge)
- stock->charge -= PAGE_SIZE;
+ if (mem == stock->cached && stock->nr_pages)
+ stock->nr_pages--;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
@@ -1697,13 +2120,15 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
- if (stock->charge) {
- res_counter_uncharge(&old->res, stock->charge);
+ if (stock->nr_pages) {
+ unsigned long bytes = stock->nr_pages * PAGE_SIZE;
+
+ res_counter_uncharge(&old->res, bytes);
if (do_swap_account)
- res_counter_uncharge(&old->memsw, stock->charge);
+ res_counter_uncharge(&old->memsw, bytes);
+ stock->nr_pages = 0;
}
stock->cached = NULL;
- stock->charge = 0;
}
/*
@@ -1714,13 +2139,14 @@ static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
drain_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *mem, int val)
+static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
@@ -1728,46 +2154,81 @@ static void refill_stock(struct mem_cgroup *mem, int val)
drain_stock(stock);
stock->cached = mem;
}
- stock->charge += val;
+ stock->nr_pages += nr_pages;
put_cpu_var(memcg_stock);
}
/*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
+ * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * of the hierarchy under it. sync flag says whether we should block
+ * until the work is done.
*/
-static void drain_all_stock_async(void)
+static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
{
- int cpu;
- /* This function is for scheduling "drain" in asynchronous way.
- * The result of "drain" is not directly handled by callers. Then,
- * if someone is calling drain, we don't have to call drain more.
- * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
- * there is a race. We just do loose check here.
- */
- if (atomic_read(&memcg_drain_count))
- return;
+ int cpu, curcpu;
+
/* Notify other cpus that system-wide "drain" is running */
- atomic_inc(&memcg_drain_count);
get_online_cpus();
+ /*
+ * Get a hint for avoiding draining charges on the current cpu,
+ * which must be exhausted by our charging. It is not required that
+ * this be a precise check, so we use raw_smp_processor_id() instead of
+ * get_cpu()/put_cpu().
+ */
+ curcpu = raw_smp_processor_id();
+ for_each_online_cpu(cpu) {
+ struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+ struct mem_cgroup *mem;
+
+ mem = stock->cached;
+ if (!mem || !stock->nr_pages)
+ continue;
+ if (!mem_cgroup_same_or_subtree(root_mem, mem))
+ continue;
+ if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (cpu == curcpu)
+ drain_local_stock(&stock->work);
+ else
+ schedule_work_on(cpu, &stock->work);
+ }
+ }
+
+ if (!sync)
+ goto out;
+
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- schedule_work_on(cpu, &stock->work);
+ if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+ flush_work(&stock->work);
}
+out:
put_online_cpus();
- atomic_dec(&memcg_drain_count);
- /* We don't wait for flush_work */
+}
+
+/*
+ * Tries to drain stocked charges on other cpus. This function is asynchronous
+ * and just schedules a work item per cpu to drain locally on each cpu. The
+ * caller can expect some charges to be returned to the res_counter later, but
+ * cannot wait for that.
+ */
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
+{
+ /*
+ * If someone is already draining, avoid adding more kworker runs.
+ */
+ if (!mutex_trylock(&percpu_charge_mutex))
+ return;
+ drain_all_stock(root_mem, false);
+ mutex_unlock(&percpu_charge_mutex);
}
/* This is a synchronous drain interface. */
-static void drain_all_stock_sync(void)
+static void drain_all_stock_sync(struct mem_cgroup *root_mem)
{
/* called when force_empty is called */
- atomic_inc(&memcg_drain_count);
- schedule_on_each_cpu(drain_local_stock);
- atomic_dec(&memcg_drain_count);
+ mutex_lock(&percpu_charge_mutex);
+ drain_all_stock(root_mem, true);
+ mutex_unlock(&percpu_charge_mutex);
}
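
drain_all_stock() now only flushes the per-CPU stocks that belong to the requested root_mem or its subtree, instead of unconditionally scheduling work everywhere. The following is a simplified userspace model of that filtering; the cgroup ids, the subtree test and the uncharge are stand-ins, and the work-queue plus FLUSHING_CACHED_CHARGE machinery is left out.

#include <stdio.h>

#define NR_CPUS 4

struct stock {
        int owner;                      /* cgroup id owning the cached pages */
        unsigned int nr_pages;          /* pre-charged pages cached on this cpu */
};

static struct stock stocks[NR_CPUS] = {
        { .owner = 1, .nr_pages = 12 },
        { .owner = 2, .nr_pages = 7 },
        { .owner = 1, .nr_pages = 3 },
        { .owner = 3, .nr_pages = 9 },
};

/* stand-in for mem_cgroup_same_or_subtree(): cgroup 1 is an ancestor of 2 */
static int same_or_subtree(int root, int owner)
{
        return owner == root || (root == 1 && owner == 2);
}

/* flush matching stocks; returns how many pages would be uncharged */
static unsigned long drain_all_stock(int root)
{
        unsigned long uncharged = 0;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                struct stock *s = &stocks[cpu];

                if (!s->nr_pages || !same_or_subtree(root, s->owner))
                        continue;
                uncharged += s->nr_pages;       /* res_counter_uncharge() here */
                s->nr_pages = 0;
        }
        return uncharged;
}

int main(void)
{
        printf("drained %lu pages for the subtree of cgroup 1\n",
               drain_all_stock(1));             /* 12 + 7 + 3 = 22 */
        return 0;
}
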
/*
@@ -1780,11 +2241,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
spin_lock(&mem->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
- s64 x = per_cpu(mem->stat->count[i], cpu);
+ long x = per_cpu(mem->stat->count[i], cpu);
per_cpu(mem->stat->count[i], cpu) = 0;
mem->nocpu_base.count[i] += x;
}
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ unsigned long x = per_cpu(mem->stat->events[i], cpu);
+
+ per_cpu(mem->stat->events[i], cpu) = 0;
+ mem->nocpu_base.events[i] += x;
+ }
/* need to clear ON_MOVE value, works as a kind of lock. */
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
spin_unlock(&mem->pcp_counter_lock);
@@ -1834,9 +2301,10 @@ enum {
CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
-static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
- int csize, bool oom_check)
+static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ unsigned int nr_pages, bool oom_check)
{
+ unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
unsigned long flags = 0;
@@ -1857,22 +2325,21 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
/*
- * csize can be either a huge page (HPAGE_SIZE), a batch of
- * regular pages (CHARGE_SIZE), or a single regular page
- * (PAGE_SIZE).
+ * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
+ * of regular pages (CHARGE_BATCH), or a single regular page (1).
*
* Never reclaim on behalf of optional batching, retry with a
* single page instead.
*/
- if (csize == CHARGE_SIZE)
+ if (nr_pages == CHARGE_BATCH)
return CHARGE_RETRY;
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (mem_cgroup_check_margin(mem_over_limit, csize))
+ gfp_mask, flags, NULL);
+ if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
return CHARGE_RETRY;
/*
* Even though the limit is exceeded at this point, reclaim
@@ -1883,7 +2350,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
* unlikely to succeed so close to the limit, and we fall back
* to regular pages anyway in case of failure.
*/
- if (csize == PAGE_SIZE && ret)
+ if (nr_pages == 1 && ret)
return CHARGE_RETRY;
/*
@@ -1909,13 +2376,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask,
- struct mem_cgroup **memcg, bool oom,
- int page_size)
+ unsigned int nr_pages,
+ struct mem_cgroup **memcg,
+ bool oom)
{
+ unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
- int csize = max(CHARGE_SIZE, (unsigned long) page_size);
/*
* Unlike global-vm's OOM-kill, we're not in memory shortage
@@ -1940,7 +2408,7 @@ again:
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
- if (page_size == PAGE_SIZE && consume_stock(mem))
+ if (nr_pages == 1 && consume_stock(mem))
goto done;
css_get(&mem->css);
} else {
@@ -1963,7 +2431,7 @@ again:
rcu_read_unlock();
goto done;
}
- if (page_size == PAGE_SIZE && consume_stock(mem)) {
+ if (nr_pages == 1 && consume_stock(mem)) {
/*
* It seems dangerous to access memcg without css_get().
* But considering how consume_stock works, it's not
@@ -1998,13 +2466,12 @@ again:
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
-
+ ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
- csize = page_size;
+ batch = nr_pages;
css_put(&mem->css);
mem = NULL;
goto again;
@@ -2025,8 +2492,8 @@ again:
}
} while (ret != CHARGE_OK);
- if (csize > page_size)
- refill_stock(mem, csize - page_size);
+ if (batch > nr_pages)
+ refill_stock(mem, batch - nr_pages);
css_put(&mem->css);
done:
*memcg = mem;
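
__mem_cgroup_try_charge() now counts in pages: a single-page charge first tries the per-CPU stock, otherwise a batch of CHARGE_BATCH pages is charged at once and the surplus is parked in the stock for later. Below is a tiny single-CPU model of that fast path; the names and the res_counter stand-in are illustrative only.

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U

static unsigned long res_usage;         /* res_counter stand-in, in pages */
static unsigned int stock_nr_pages;     /* per-CPU stock, one CPU here */

static bool consume_stock(void)
{
        if (stock_nr_pages) {
                stock_nr_pages--;       /* cheap path: no counter update */
                return true;
        }
        return false;
}

static void try_charge(unsigned int nr_pages)
{
        unsigned int batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;

        if (nr_pages == 1 && consume_stock())
                return;

        res_usage += batch;             /* one res_counter_charge() */
        if (batch > nr_pages)
                stock_nr_pages += batch - nr_pages;     /* refill_stock() */
}

int main(void)
{
        int i;

        for (i = 0; i < 40; i++)        /* forty single-page charges */
                try_charge(1);
        printf("counter saw %lu pages, %u still cached\n",
               res_usage, stock_nr_pages);
        return 0;
}
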
@@ -2045,21 +2512,17 @@ bypass:
* gotten by try_charge().
*/
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
- unsigned long count)
+ unsigned int nr_pages)
{
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+ unsigned long bytes = nr_pages * PAGE_SIZE;
+
+ res_counter_uncharge(&mem->res, bytes);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
+ res_counter_uncharge(&mem->memsw, bytes);
}
}
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
- int page_size)
-{
- __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
-}
-
/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -2108,20 +2571,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
}
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page *page,
+ unsigned int nr_pages,
struct page_cgroup *pc,
- enum charge_type ctype,
- int page_size)
+ enum charge_type ctype)
{
- int nr_pages = page_size >> PAGE_SHIFT;
-
- /* try_charge() can return NULL to *memcg, taking care of it. */
- if (!mem)
- return;
-
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem, page_size);
+ __mem_cgroup_cancel_charge(mem, nr_pages);
return;
}
/*
@@ -2158,7 +2616,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
*/
- memcg_check_events(mem, pc->page);
+ memcg_check_events(mem, page);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2195,7 +2653,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
* We hold lru_lock, then, reduce counter directly.
*/
lru = page_lru(head);
- mz = page_cgroup_zoneinfo(head_pc);
+ mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
}
tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
@@ -2204,7 +2662,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
#endif
/**
- * __mem_cgroup_move_account - move account of the page
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
@@ -2212,25 +2672,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
- * - the pc is locked, used, and ->mem_cgroup points to @from.
+ * - compound_lock is held when nr_pages > 1
*
* This function doesn't do "charge" nor css_get to new cgroup. It should be
- * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is
+ * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is
* true, this function does "uncharge" from old cgroup, but it doesn't if
* @uncharge is false, so a caller should do "uncharge".
*/
-
-static void __mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
- int charge_size)
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct page_cgroup *pc,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to,
+ bool uncharge)
{
- int nr_pages = charge_size >> PAGE_SHIFT;
+ unsigned long flags;
+ int ret;
VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!page_is_cgroup_locked(pc));
- VM_BUG_ON(!PageCgroupUsed(pc));
- VM_BUG_ON(pc->mem_cgroup != from);
+ VM_BUG_ON(PageLRU(page));
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ lock_page_cgroup(pc);
+
+ ret = -EINVAL;
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
+ goto unlock;
+
+ move_lock_page_cgroup(pc, &flags);
if (PageCgroupFileMapped(pc)) {
/* Update mapped_file data for mem_cgroup */
@@ -2242,7 +2719,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
- mem_cgroup_cancel_charge(from, charge_size);
+ __mem_cgroup_cancel_charge(from, nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
@@ -2251,43 +2728,19 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
* this function is just force_empty() and move charge, so it's
- * garanteed that "to" is never removed. So, we don't check rmdir
+ * guaranteed that "to" is never removed. So, we don't check rmdir
* status here.
*/
-}
-
-/*
- * check whether the @pc is valid for moving account and call
- * __mem_cgroup_move_account()
- */
-static int mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to,
- bool uncharge, int charge_size)
-{
- int ret = -EINVAL;
- unsigned long flags;
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
- return -EBUSY;
-
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
- move_lock_page_cgroup(pc, &flags);
- __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
- move_unlock_page_cgroup(pc, &flags);
- ret = 0;
- }
+ move_unlock_page_cgroup(pc, &flags);
+ ret = 0;
+unlock:
unlock_page_cgroup(pc);
/*
* check events
*/
- memcg_check_events(to, pc->page);
- memcg_check_events(from, pc->page);
+ memcg_check_events(to, page);
+ memcg_check_events(from, page);
+out:
return ret;
}
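
The rewritten mem_cgroup_move_account() folds the old validity check into one function: take the page_cgroup lock, verify the pc still belongs to @from, transfer the statistics, optionally cancel the charge on @from, and only then repoint pc->mem_cgroup. A reduced sketch of that validate-then-move flow follows; the locks are reduced to comments and the accounting structs are made up for illustration.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct cgroup_acct {                    /* stand-in for struct mem_cgroup */
        long rss;
        long charged;
};

struct page_cgroup {
        bool used;                      /* PageCgroupUsed() */
        struct cgroup_acct *owner;      /* pc->mem_cgroup */
};

static int move_account(struct page_cgroup *pc, unsigned int nr_pages,
                        struct cgroup_acct *from, struct cgroup_acct *to,
                        bool uncharge)
{
        /* lock_page_cgroup(pc) and move_lock_page_cgroup() would go here */
        if (!pc->used || pc->owner != from)
                return -EINVAL;         /* page was moved or freed under us */

        from->rss -= nr_pages;          /* mem_cgroup_charge_statistics(from) */
        if (uncharge)
                from->charged -= nr_pages;      /* __mem_cgroup_cancel_charge() */
        pc->owner = to;                 /* caller already holds a reference */
        to->rss += nr_pages;            /* mem_cgroup_charge_statistics(to) */
        /* unlock in reverse order, then memcg_check_events() on both */
        return 0;
}

int main(void)
{
        struct cgroup_acct child = { .rss = 4, .charged = 4 };
        struct cgroup_acct parent = { 0 };
        struct page_cgroup pc = { .used = true, .owner = &child };

        printf("move: %d, parent.rss=%ld child.charged=%ld\n",
               move_account(&pc, 4, &child, &parent, true),
               parent.rss, child.charged);
        return 0;
}
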
@@ -2295,16 +2748,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
* move charges to its parent.
*/
-static int mem_cgroup_move_parent(struct page_cgroup *pc,
+static int mem_cgroup_move_parent(struct page *page,
+ struct page_cgroup *pc,
struct mem_cgroup *child,
gfp_t gfp_mask)
{
- struct page *page = pc->page;
struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
- int page_size = PAGE_SIZE;
- unsigned long flags;
+ unsigned int nr_pages;
+ unsigned long uninitialized_var(flags);
int ret;
/* Is ROOT ? */
@@ -2317,23 +2770,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (isolate_lru_page(page))
goto put;
- if (PageTransHuge(page))
- page_size = HPAGE_SIZE;
+ nr_pages = hpage_nr_pages(page);
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask,
- &parent, false, page_size);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
if (ret || !parent)
goto put_back;
- if (page_size > PAGE_SIZE)
+ if (nr_pages > 1)
flags = compound_lock_irqsave(page);
- ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
+ ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
if (ret)
- mem_cgroup_cancel_charge(parent, page_size);
+ __mem_cgroup_cancel_charge(parent, nr_pages);
- if (page_size > PAGE_SIZE)
+ if (nr_pages > 1)
compound_unlock_irqrestore(page, flags);
put_back:
putback_lru_page(page);
@@ -2353,13 +2804,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
{
struct mem_cgroup *mem = NULL;
- int page_size = PAGE_SIZE;
+ unsigned int nr_pages = 1;
struct page_cgroup *pc;
bool oom = true;
int ret;
if (PageTransHuge(page)) {
- page_size <<= compound_order(page);
+ nr_pages <<= compound_order(page);
VM_BUG_ON(!PageTransHuge(page));
/*
* Never OOM-kill a process for a huge page. The
@@ -2369,16 +2820,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
}
pc = lookup_page_cgroup(page);
- /* can happen at boot */
- if (unlikely(!pc))
- return 0;
- prefetchw(pc);
+ BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
+ __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
return 0;
}
@@ -2406,51 +2854,52 @@ static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype);
+static void
+__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
+ enum charge_type ctype)
+{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ /*
+ * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page
+ * is already on the LRU. It means the page may be on some other
+ * page_cgroup's LRU. Take care of it.
+ */
+ mem_cgroup_lru_del_before_commit(page);
+ __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
+ mem_cgroup_lru_add_after_commit(page);
+ return;
+}
+
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
+ struct mem_cgroup *mem = NULL;
int ret;
if (mem_cgroup_disabled())
return 0;
if (PageCompound(page))
return 0;
- /*
- * Corner case handling. This is called from add_to_page_cache()
- * in usual. But some FS (shmem) precharges this page before calling it
- * and call add_to_page_cache() with GFP_NOWAIT.
- *
- * For GFP_NOWAIT case, the page may be pre-charged before calling
- * add_to_page_cache(). (See shmem.c) check it here and avoid to call
- * charge twice. (It works but has to pay a bit larger cost.)
- * And when the page is SwapCache, it should take swap information
- * into account. This is under lock_page() now.
- */
- if (!(gfp_mask & __GFP_WAIT)) {
- struct page_cgroup *pc;
-
- pc = lookup_page_cgroup(page);
- if (!pc)
- return 0;
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc)) {
- unlock_page_cgroup(pc);
- return 0;
- }
- unlock_page_cgroup(pc);
- }
if (unlikely(!mm))
mm = &init_mm;
- if (page_is_file_cache(page))
- return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE);
+ if (page_is_file_cache(page)) {
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
+ if (ret || !mem)
+ return ret;
+ /*
+ * FUSE reuses pages without going through the final
+ * put that would remove them from the LRU list, make
+ * sure that they get relinked properly.
+ */
+ __mem_cgroup_commit_charge_lrucare(page, mem,
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
+ return ret;
+ }
/* shmem */
if (PageSwapCache(page)) {
- struct mem_cgroup *mem = NULL;
-
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
@@ -2475,6 +2924,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct mem_cgroup *mem;
int ret;
+ *ptr = NULL;
+
if (mem_cgroup_disabled())
return 0;
@@ -2492,30 +2943,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
+ ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
+ return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
}
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype)
{
- struct page_cgroup *pc;
-
if (mem_cgroup_disabled())
return;
if (!ptr)
return;
cgroup_exclude_rmdir(&ptr->css);
- pc = lookup_page_cgroup(page);
- mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
- mem_cgroup_lru_add_after_commit_swapcache(page);
+
+ __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
@@ -2563,15 +3010,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem, PAGE_SIZE);
+ __mem_cgroup_cancel_charge(mem, 1);
}
-static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
- int page_size)
+static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
+ unsigned int nr_pages,
+ const enum charge_type ctype)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
+
/* If swapout, usage of swap doesn't decrease */
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
uncharge_memsw = false;
@@ -2586,7 +3034,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
batch->memcg = mem;
/*
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
- * In those cases, all pages freed continously can be expected to be in
+ * In those cases, all pages freed continuously can be expected to be in
* the same cgroup and we have chance to coalesce uncharges.
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
* because we want to do uncharge as soon as possible.
@@ -2595,7 +3043,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
- if (page_size != PAGE_SIZE)
+ if (nr_pages > 1)
goto direct_uncharge;
/*
@@ -2606,14 +3054,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
if (batch->memcg != mem)
goto direct_uncharge;
/* remember freed charge and uncharge it later */
- batch->bytes += PAGE_SIZE;
+ batch->nr_pages++;
if (uncharge_memsw)
- batch->memsw_bytes += PAGE_SIZE;
+ batch->memsw_nr_pages++;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, page_size);
+ res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, page_size);
+ res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
return;
@@ -2625,10 +3073,9 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
- int count;
- struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
- int page_size = PAGE_SIZE;
+ unsigned int nr_pages = 1;
+ struct page_cgroup *pc;
if (mem_cgroup_disabled())
return NULL;
@@ -2637,11 +3084,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
return NULL;
if (PageTransHuge(page)) {
- page_size <<= compound_order(page);
+ nr_pages <<= compound_order(page);
VM_BUG_ON(!PageTransHuge(page));
}
-
- count = page_size >> PAGE_SHIFT;
/*
* Check if our page_cgroup is valid
*/
@@ -2674,7 +3119,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -2695,7 +3140,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mem_cgroup_get(mem);
}
if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype, page_size);
+ mem_cgroup_do_uncharge(mem, nr_pages, ctype);
return mem;
@@ -2735,8 +3180,8 @@ void mem_cgroup_uncharge_start(void)
/* We can do nest. */
if (current->memcg_batch.do_batch == 1) {
current->memcg_batch.memcg = NULL;
- current->memcg_batch.bytes = 0;
- current->memcg_batch.memsw_bytes = 0;
+ current->memcg_batch.nr_pages = 0;
+ current->memcg_batch.memsw_nr_pages = 0;
}
}
@@ -2757,10 +3202,12 @@ void mem_cgroup_uncharge_end(void)
* This "batch->memcg" is valid without any css_get/put etc...
* because we hide charges behind us.
*/
- if (batch->bytes)
- res_counter_uncharge(&batch->memcg->res, batch->bytes);
- if (batch->memsw_bytes)
- res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+ if (batch->nr_pages)
+ res_counter_uncharge(&batch->memcg->res,
+ batch->nr_pages * PAGE_SIZE);
+ if (batch->memsw_nr_pages)
+ res_counter_uncharge(&batch->memcg->memsw,
+ batch->memsw_nr_pages * PAGE_SIZE);
memcg_oom_recover(batch->memcg);
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
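
The uncharge batching now accumulates page counts rather than byte counts and settles them with one res_counter operation at mem_cgroup_uncharge_end(). A compact sketch of that coalescing is shown below; the per-task batch is modeled as a global, and memsw would be handled the same way.

#include <stdio.h>

struct batch_info {
        int do_batch;                   /* nesting depth */
        unsigned long nr_pages;         /* coalesced plain uncharges */
        unsigned long memsw_nr_pages;   /* same idea for mem+swap */
};

static struct batch_info batch;
static unsigned long res_pages = 400;   /* res_counter stand-in, in pages */

static void uncharge_start(void)
{
        batch.do_batch++;
}

static void do_uncharge(unsigned long nr_pages)
{
        if (batch.do_batch) {
                batch.nr_pages += nr_pages;     /* remember, settle later */
                return;
        }
        res_pages -= nr_pages;          /* direct res_counter_uncharge() */
}

static void uncharge_end(void)
{
        if (--batch.do_batch)
                return;
        res_pages -= batch.nr_pages;    /* one counter update for the batch */
        batch.nr_pages = 0;
}

int main(void)
{
        int i;

        uncharge_start();
        for (i = 0; i < 128; i++)       /* e.g. truncating a mapped file */
                do_uncharge(1);
        uncharge_end();
        printf("usage now %lu pages\n", res_pages);
        return 0;
}
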
@@ -2883,13 +3330,15 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
* page belongs to.
*/
int mem_cgroup_prepare_migration(struct page *page,
- struct page *newpage, struct mem_cgroup **ptr)
+ struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
{
- struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ struct page_cgroup *pc;
enum charge_type ctype;
int ret = 0;
+ *ptr = NULL;
+
VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
@@ -2940,7 +3389,7 @@ int mem_cgroup_prepare_migration(struct page *page,
return 0;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
@@ -2967,7 +3416,7 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
+ __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
return ret;
}
@@ -3020,30 +3469,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
cgroup_release_and_wakeup_rmdir(&mem->css);
}
-/*
- * A call to try to shrink memory usage on charge failure at shmem's swapin.
- * Calling hierarchical_reclaim is not enough because we should update
- * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
- * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
- * not from the memcg which this page would be charged to.
- * try_charge_swapin does all of these works properly.
- */
-int mem_cgroup_shmem_charge_fallback(struct page *page,
- struct mm_struct *mm,
- gfp_t gfp_mask)
+#ifdef CONFIG_DEBUG_VM
+static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
{
- struct mem_cgroup *mem = NULL;
- int ret;
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+ if (likely(pc) && PageCgroupUsed(pc))
+ return pc;
+ return NULL;
+}
+bool mem_cgroup_bad_page_check(struct page *page)
+{
if (mem_cgroup_disabled())
- return 0;
+ return false;
- ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
- if (!ret)
- mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
+ return lookup_page_cgroup_used(page) != NULL;
+}
- return ret;
+void mem_cgroup_print_bad_page(struct page *page)
+{
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup_used(page);
+ if (pc) {
+ int ret = -1;
+ char *path;
+
+ printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
+ pc, pc->flags, pc->mem_cgroup);
+
+ path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (path) {
+ rcu_read_lock();
+ ret = cgroup_path(pc->mem_cgroup->css.cgroup,
+ path, PATH_MAX);
+ rcu_read_unlock();
+ }
+
+ printk(KERN_CONT "(%s)\n",
+ (ret < 0) ? "cannot get the path" : path);
+ kfree(path);
+ }
}
+#endif
static DEFINE_MUTEX(set_limit_mutex);
@@ -3102,7 +3572,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
break;
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_SHRINK);
+ MEM_CGROUP_RECLAIM_SHRINK,
+ NULL);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3162,7 +3633,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
- MEM_CGROUP_RECLAIM_SHRINK);
+ MEM_CGROUP_RECLAIM_SHRINK,
+ NULL);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3176,7 +3648,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
- gfp_t gfp_mask)
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -3184,6 +3657,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
unsigned long long excess;
+ unsigned long nr_scanned;
if (order > 0)
return 0;
@@ -3202,10 +3676,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
if (!mz)
break;
+ nr_scanned = 0;
reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
gfp_mask,
- MEM_CGROUP_RECLAIM_SOFT);
+ MEM_CGROUP_RECLAIM_SOFT,
+ &nr_scanned);
nr_reclaimed += reclaimed;
+ *total_scanned += nr_scanned;
spin_lock(&mctz->lock);
/*
@@ -3228,10 +3705,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
*/
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);
- if (next_mz == mz) {
+ if (next_mz == mz)
css_put(&next_mz->mem->css);
- next_mz = NULL;
- } else /* next_mz == NULL or other memcg */
+ else /* next_mz == NULL or other memcg */
break;
} while (1);
}
@@ -3288,6 +3764,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
loop += 256;
busy = NULL;
while (loop--) {
+ struct page *page;
+
ret = 0;
spin_lock_irqsave(&zone->lru_lock, flags);
if (list_empty(list)) {
@@ -3303,7 +3781,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
}
spin_unlock_irqrestore(&zone->lru_lock, flags);
- ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
+ page = lookup_cgroup_page(pc);
+
+ ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
break;
@@ -3347,7 +3827,7 @@ move_account:
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
- drain_all_stock_sync();
+ drain_all_stock_sync(mem);
ret = 0;
mem_cgroup_start_move(mem);
for_each_node_state(node, N_HIGH_MEMORY) {
@@ -3386,14 +3866,18 @@ try_to_free:
/* try to free all pages in this cgroup */
shrink = 1;
while (nr_retries && mem->res.usage > 0) {
+ struct memcg_scanrecord rec;
int progress;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
+ rec.context = SCAN_BY_SHRINK;
+ rec.mem = mem;
+ rec.root = mem;
progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
- false, get_swappiness(mem));
+ false, &rec);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
@@ -3451,13 +3935,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
}
-static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx)
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
- s64 val = 0;
+ long val = 0;
- /* each per cpu's value can be minus.Then, use s64 */
+ /* Per-cpu values can be negative, use a signed accumulator */
for_each_mem_cgroup_tree(iter, mem)
val += mem_cgroup_read_stat(iter, idx);
@@ -3477,12 +3961,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
return res_counter_read_u64(&mem->memsw, RES_USAGE);
}
- val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
if (swap)
- val += mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_SWAPOUT);
+ val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
return val << PAGE_SHIFT;
}
@@ -3660,6 +4143,8 @@ enum {
MCS_PGPGIN,
MCS_PGPGOUT,
MCS_SWAP,
+ MCS_PGFAULT,
+ MCS_PGMAJFAULT,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -3682,6 +4167,8 @@ struct {
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"swap", "total_swap"},
+ {"pgfault", "total_pgfault"},
+ {"pgmajfault", "total_pgmajfault"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -3702,25 +4189,29 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
s->stat[MCS_RSS] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
+ s->stat[MCS_PGFAULT] += val;
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
+ s->stat[MCS_PGMAJFAULT] += val;
/* per zone stat */
- val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
+ val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
+ val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
+ val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
+ val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
+ val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
}
@@ -3733,6 +4224,53 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
mem_cgroup_get_local_stat(iter, s);
}
+#ifdef CONFIG_NUMA
+static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
+{
+ int nid;
+ unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
+ unsigned long node_nr;
+ struct cgroup *cont = m->private;
+ struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+
+ total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
+ seq_printf(m, "total=%lu", total_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
+ seq_printf(m, "file=%lu", file_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ LRU_ALL_FILE);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
+ seq_printf(m, "anon=%lu", anon_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ LRU_ALL_ANON);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
+ seq_printf(m, "unevictable=%lu", unevictable_nr);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ BIT(LRU_UNEVICTABLE));
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+ return 0;
+}
+#endif /* CONFIG_NUMA */
+
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
struct cgroup_map_cb *cb)
{
@@ -3743,6 +4281,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_local_stat(mem_cont, &mystat);
+
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
@@ -3802,7 +4341,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
- return get_swappiness(memcg);
+ return mem_cgroup_swappiness(memcg);
}
static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -3828,9 +4367,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
return -EINVAL;
}
- spin_lock(&memcg->reclaim_param_lock);
memcg->swappiness = val;
- spin_unlock(&memcg->reclaim_param_lock);
cgroup_unlock();
@@ -4094,15 +4631,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
if (!event)
return -ENOMEM;
- mutex_lock(&memcg_oom_mutex);
+ spin_lock(&memcg_oom_lock);
event->eventfd = eventfd;
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
- if (atomic_read(&memcg->oom_lock))
+ if (atomic_read(&memcg->under_oom))
eventfd_signal(eventfd, 1);
- mutex_unlock(&memcg_oom_mutex);
+ spin_unlock(&memcg_oom_lock);
return 0;
}
@@ -4116,7 +4653,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
BUG_ON(type != _OOM_TYPE);
- mutex_lock(&memcg_oom_mutex);
+ spin_lock(&memcg_oom_lock);
list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
if (ev->eventfd == eventfd) {
@@ -4125,7 +4662,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
}
}
- mutex_unlock(&memcg_oom_mutex);
+ spin_unlock(&memcg_oom_lock);
}
static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -4135,7 +4672,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
- if (atomic_read(&mem->oom_lock))
+ if (atomic_read(&mem->under_oom))
cb->fill(cb, "under_oom", 1);
else
cb->fill(cb, "under_oom", 0);
@@ -4168,6 +4705,70 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
return 0;
}
+#ifdef CONFIG_NUMA
+static const struct file_operations mem_control_numa_stat_file_operations = {
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
+{
+ struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
+
+ file->f_op = &mem_control_numa_stat_file_operations;
+ return single_open(file, mem_control_numa_stat_show, cont);
+}
+#endif /* CONFIG_NUMA */
+
+static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
+ struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ char string[64];
+ int i;
+
+ for (i = 0; i < NR_SCANSTATS; i++) {
+ strcpy(string, scanstat_string[i]);
+ strcat(string, SCANSTAT_WORD_LIMIT);
+ cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
+ }
+
+ for (i = 0; i < NR_SCANSTATS; i++) {
+ strcpy(string, scanstat_string[i]);
+ strcat(string, SCANSTAT_WORD_SYSTEM);
+ cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
+ }
+
+ for (i = 0; i < NR_SCANSTATS; i++) {
+ strcpy(string, scanstat_string[i]);
+ strcat(string, SCANSTAT_WORD_LIMIT);
+ strcat(string, SCANSTAT_WORD_HIERARCHY);
+ cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
+ }
+ for (i = 0; i < NR_SCANSTATS; i++) {
+ strcpy(string, scanstat_string[i]);
+ strcat(string, SCANSTAT_WORD_SYSTEM);
+ strcat(string, SCANSTAT_WORD_HIERARCHY);
+ cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
+ }
+ return 0;
+}
+
+static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
+ unsigned int event)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+ spin_lock(&mem->scanstat.lock);
+ memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
+ memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
+ spin_unlock(&mem->scanstat.lock);
+ return 0;
+}
+
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -4231,6 +4832,18 @@ static struct cftype mem_cgroup_files[] = {
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .open = mem_control_numa_stat_open,
+ .mode = S_IRUGO,
+ },
+#endif
+ {
+ .name = "vmscan_stat",
+ .read_map = mem_cgroup_vmscan_stat_read,
+ .trigger = mem_cgroup_reset_vmscan_stat,
+ },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4486,14 +5099,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_child = 0;
- spin_lock_init(&mem->reclaim_param_lock);
+ mem->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&mem->oom_notify);
if (parent)
- mem->swappiness = get_swappiness(parent);
+ mem->swappiness = mem_cgroup_swappiness(parent);
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
+ spin_lock_init(&mem->scanstat.lock);
return &mem->css;
free_out:
__mem_cgroup_free(mem);
@@ -4574,8 +5188,7 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
- PAGE_SIZE);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
@@ -4675,15 +5288,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
- if (!mapping_cap_swap_backed(mapping)) { /* normal file */
- page = find_get_page(mapping, pgoff);
- } else { /* shmem/tmpfs file. we should take account of swap too. */
- swp_entry_t ent;
- mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
+ page = find_get_page(mapping, pgoff);
+
+#ifdef CONFIG_SWAP
+ /* shmem/tmpfs may report page out on swap: account for that too. */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swap = radix_to_swp_entry(page);
if (do_swap_account)
- entry->val = ent.val;
+ *entry = swap;
+ page = find_get_page(&swapper_space, swap.val);
}
-
+#endif
return page;
}
@@ -4737,7 +5352,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- VM_BUG_ON(pmd_trans_huge(*pmd));
+ split_huge_page_pmd(walk->mm, pmd);
+
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4844,8 +5460,7 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
int ret = 0;
struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4884,8 +5499,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
mem_cgroup_clear_mc();
}
@@ -4899,8 +5513,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
+ split_huge_page_pmd(walk->mm, pmd);
retry:
- VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
@@ -4920,8 +5534,8 @@ retry:
if (isolate_lru_page(page))
goto put;
pc = lookup_page_cgroup(page);
- if (!mem_cgroup_move_account(pc,
- mc.from, mc.to, false, PAGE_SIZE)) {
+ if (!mem_cgroup_move_account(page, 1, pc,
+ mc.from, mc.to, false)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -5003,41 +5617,35 @@ retry:
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
- struct mm_struct *mm;
-
- if (!mc.to)
- /* no need to move charge */
- return;
+ struct mm_struct *mm = get_task_mm(p);
- mm = get_task_mm(p);
if (mm) {
- mem_cgroup_move_charge(mm);
+ if (mc.to)
+ mem_cgroup_move_charge(mm);
+ put_swap_token(mm);
mmput(mm);
}
- mem_cgroup_clear_mc();
+ if (mc.to)
+ mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
#endif
@@ -5060,19 +5668,12 @@ struct cgroup_subsys mem_cgroup_subsys = {
static int __init enable_swap_account(char *s)
{
/* consider enabled if no parameter or 1 is given */
- if (!(*s) || !strcmp(s, "=1"))
+ if (!strcmp(s, "1"))
really_do_swap_account = 1;
- else if (!strcmp(s, "=0"))
+ else if (!strcmp(s, "0"))
really_do_swap_account = 0;
return 1;
}
-__setup("swapaccount", enable_swap_account);
+__setup("swapaccount=", enable_swap_account);
-static int __init disable_swap_account(char *s)
-{
- printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
- enable_swap_account("=0");
- return 1;
-}
-__setup("noswapaccount", disable_swap_account);
#endif
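As the swapaccount hunk above shows, the handler is now registered with __setup("swapaccount=", ...), so it receives only the text after the '=' and therefore compares against "1"/"0" instead of "=1"/"=0". A minimal sketch of that pattern, using a hypothetical parameter and flag that are not part of this patch:

	#include <linux/init.h>
	#include <linux/string.h>

	static int example_feature __initdata;	/* hypothetical flag */

	static int __init example_feature_setup(char *s)
	{
		/* s is only the text after '=', e.g. "1" for example_feature=1 */
		if (!strcmp(s, "1"))
			example_feature = 1;
		else if (!strcmp(s, "0"))
			example_feature = 0;
		return 1;
	}
	__setup("example_feature=", example_feature_setup);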
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0207c2f6f8b..2b43ba051ac 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,8 @@
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
+#include <linux/kfifo.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -208,7 +210,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
* This could cause a loop when the user sets SIGBUS
- * to SIG_IGN, but hopefully noone will do that?
+ * to SIG_IGN, but hopefully no one will do that?
*/
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
if (ret < 0)
@@ -239,7 +241,11 @@ void shake_page(struct page *p, int access)
if (access) {
int nr;
do {
- nr = shrink_slab(1000, GFP_KERNEL, 1000);
+ struct shrink_control shrink = {
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ nr = shrink_slab(&shrink, 1000, 1000);
if (page_count(p) == 1)
break;
} while (nr > 10);
@@ -386,10 +392,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct task_struct *tsk;
struct anon_vma *av;
- read_lock(&tasklist_lock);
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
- goto out;
+ return;
+
+ read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
@@ -403,9 +410,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- page_unlock_anon_vma(av);
-out:
read_unlock(&tasklist_lock);
+ page_unlock_anon_vma(av);
}
/*
@@ -419,17 +425,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
- /*
- * A note on the locking order between the two locks.
- * We don't rely on this particular order.
- * If you have some other code that needs a different order
- * feel free to switch them around. Or add a reverse link
- * from mm_struct to task_struct, then this could be all
- * done without taking tasklist_lock and looping over all tasks.
- */
-
+ mutex_lock(&mapping->i_mmap_mutex);
read_lock(&tasklist_lock);
- spin_lock(&mapping->i_mmap_lock);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -449,8 +446,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- spin_unlock(&mapping->i_mmap_lock);
read_unlock(&tasklist_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -634,7 +631,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* when the page is reread or dropped. If an
* application assumes it will always get error on
* fsync, but does other operations on the fd before
- * and the page is dropped inbetween then the error
+ * and the page is dropped in between, then the error
* will not be properly reported.
*
* This can already happen even without hwpoisoned
@@ -728,7 +725,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* The table matches them in order and calls the right handler.
*
* This is quite tricky because we can access page at any time
- * in its live cycle, so all accesses have to be extremly careful.
+ * in its live cycle, so all accesses have to be extremely careful.
*
* This is not complete. More states could be added.
* For any missing state don't attempt recovery.
@@ -945,7 +942,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
collect_procs(ppage, &tokill);
if (hpage != ppage)
- lock_page_nosync(ppage);
+ lock_page(ppage);
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
@@ -1038,7 +1035,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* Check "just unpoisoned", "filter hit", and
* "race with other subpage."
*/
- lock_page_nosync(hpage);
+ lock_page(hpage);
if (!PageHWPoison(hpage)
|| (hwpoison_filter(p) && TestClearPageHWPoison(p))
|| (p != hpage && TestSetPageHWPoison(hpage))) {
@@ -1088,7 +1085,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(hpage);
+ lock_page(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1130,7 +1127,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
/*
* Now take care of user space mappings.
- * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
*/
if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1182,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno)
__memory_failure(pfn, trapno, 0);
}
+#define MEMORY_FAILURE_FIFO_ORDER 4
+#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry {
+ unsigned long pfn;
+ int trapno;
+ int flags;
+};
+
+struct memory_failure_cpu {
+ DECLARE_KFIFO(fifo, struct memory_failure_entry,
+ MEMORY_FAILURE_FIFO_SIZE);
+ spinlock_t lock;
+ struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
+
+/**
+ * memory_failure_queue - Schedule handling memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: Flags for memory failure handling
+ *
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * recovery of the error page, including dropping pages, killing
+ * processes, etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Can run in IRQ context.
+ */
+void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+ struct memory_failure_cpu *mf_cpu;
+ unsigned long proc_flags;
+ struct memory_failure_entry entry = {
+ .pfn = pfn,
+ .trapno = trapno,
+ .flags = flags,
+ };
+
+ mf_cpu = &get_cpu_var(memory_failure_cpu);
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ if (kfifo_put(&mf_cpu->fifo, &entry))
+ schedule_work_on(smp_processor_id(), &mf_cpu->work);
+ else
+ pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
+ pfn);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+static void memory_failure_work_func(struct work_struct *work)
+{
+ struct memory_failure_cpu *mf_cpu;
+ struct memory_failure_entry entry = { 0, };
+ unsigned long proc_flags;
+ int gotten;
+
+ mf_cpu = &__get_cpu_var(memory_failure_cpu);
+ for (;;) {
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ gotten = kfifo_get(&mf_cpu->fifo, &entry);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ if (!gotten)
+ break;
+ __memory_failure(entry.pfn, entry.trapno, entry.flags);
+ }
+}
+
+static int __init memory_failure_init(void)
+{
+ struct memory_failure_cpu *mf_cpu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ spin_lock_init(&mf_cpu->lock);
+ INIT_KFIFO(mf_cpu->fifo);
+ INIT_WORK(&mf_cpu->work, memory_failure_work_func);
+ }
+
+ return 0;
+}
+core_initcall(memory_failure_init);
+
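The queueing API added above is intended to be called from the low-level hardware error handler itself, which may run in IRQ context; the heavy lifting is deferred to the per-CPU work item. A hedged sketch of a caller, where the callback name and the way the physical address arrives are assumptions rather than part of this patch:

	#include <linux/mm.h>

	/* hypothetical hook invoked by a platform's memory-error handler */
	static void example_report_bad_page(u64 paddr, int trapno)
	{
		unsigned long pfn = paddr >> PAGE_SHIFT;

		/* safe in IRQ context; recovery runs later from the workqueue */
		memory_failure_queue(pfn, trapno, 0);
	}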
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
@@ -1231,7 +1319,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- lock_page_nosync(page);
+ lock_page(page);
/*
* This test is racy because PG_hwpoison is set outside of page lock.
* That's acceptable because that won't trigger kernel panic. Instead,
@@ -1440,16 +1528,12 @@ int soft_offline_page(struct page *page, int flags)
*/
ret = invalidate_inode_page(page);
unlock_page(page);
-
/*
- * Drop count because page migration doesn't like raised
- * counts. The page could get re-allocated, but if it becomes
- * LRU the isolation will just fail.
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- put_page(page);
if (ret == 1) {
+ put_page(page);
ret = 0;
pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
@@ -1461,9 +1545,15 @@ int soft_offline_page(struct page *page, int flags)
* handles a large number of cases for us.
*/
ret = isolate_lru_page(page);
+ /*
+ * Drop the page reference that came from get_any_page();
+ * a successful isolate_lru_page() already took another one.
+ */
+ put_page(page);
if (!ret) {
LIST_HEAD(pagelist);
-
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
0, true);
@@ -1487,35 +1577,3 @@ done:
/* keep elevated page count for bad page */
return ret;
}
-
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
- */
-int is_hwpoison_address(unsigned long addr)
-{
- pgd_t *pgdp;
- pud_t pud, *pudp;
- pmd_t pmd, *pmdp;
- pte_t pte, *ptep;
- swp_entry_t entry;
-
- pgdp = pgd_offset(current->mm, addr);
- if (!pgd_present(*pgdp))
- return 0;
- pudp = pud_offset(pgdp, addr);
- pud = *pudp;
- if (!pud_present(pud) || pud_large(pud))
- return 0;
- pmdp = pmd_offset(pudp, addr);
- pmd = *pmdp;
- if (!pmd_present(pmd) || pmd_large(pmd))
- return 0;
- ptep = pte_offset_map(pmdp, addr);
- pte = *ptep;
- pte_unmap(ptep);
- if (!is_swap_pte(pte))
- return 0;
- entry = pte_to_swp_entry(pte);
- return is_hwpoison_entry(entry);
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 5823698c2b7..a56e3ba816b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
__sync_task_rss_stat(task, mm);
}
-#else
+#else /* SPLIT_RSS_COUNTING */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -191,7 +191,205 @@ static void check_sync_rss_stat(struct task_struct *task)
{
}
+#endif /* SPLIT_RSS_COUNTING */
+
+#ifdef HAVE_GENERIC_MMU_GATHER
+
+static int tlb_next_batch(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ batch = tlb->active;
+ if (batch->next) {
+ tlb->active = batch->next;
+ return 1;
+ }
+
+ batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ if (!batch)
+ return 0;
+
+ batch->next = NULL;
+ batch->nr = 0;
+ batch->max = MAX_GATHER_BATCH;
+
+ tlb->active->next = batch;
+ tlb->active = batch;
+
+ return 1;
+}
+
+/* tlb_gather_mmu
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @fullmm argument is used when @mm is without
+ * users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
+{
+ tlb->mm = mm;
+
+ tlb->fullmm = fullmm;
+ tlb->need_flush = 0;
+ tlb->fast_mode = (num_possible_cpus() == 1);
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local;
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb->batch = NULL;
#endif
+}
+
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ if (!tlb->need_flush)
+ return;
+ tlb->need_flush = 0;
+ tlb_flush(tlb);
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb_table_flush(tlb);
+#endif
+
+ if (tlb_fast_mode(tlb))
+ return;
+
+ for (batch = &tlb->local; batch; batch = batch->next) {
+ free_pages_and_swap_cache(batch->pages, batch->nr);
+ batch->nr = 0;
+ }
+ tlb->active = &tlb->local;
+}
+
+/* tlb_finish_mmu
+ * Called at the end of the shootdown operation to free up any resources
+ * that were required.
+ */
+void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+ struct mmu_gather_batch *batch, *next;
+
+ tlb_flush_mmu(tlb);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+
+ for (batch = tlb->local.next; batch; batch = next) {
+ next = batch->next;
+ free_pages((unsigned long)batch, 0);
+ }
+ tlb->local.next = NULL;
+}
+
+/* __tlb_remove_page
+ * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
+ * handling the additional races in SMP caused by other CPUs caching valid
+ * mappings in their TLBs. Returns the number of free page slots left.
+ * When out of page slots we must call tlb_flush_mmu().
+ */
+int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+ struct mmu_gather_batch *batch;
+
+ tlb->need_flush = 1;
+
+ if (tlb_fast_mode(tlb)) {
+ free_page_and_swap_cache(page);
+ return 1; /* avoid calling tlb_flush_mmu() */
+ }
+
+ batch = tlb->active;
+ batch->pages[batch->nr++] = page;
+ if (batch->nr == batch->max) {
+ if (!tlb_next_batch(tlb))
+ return 0;
+ batch = tlb->active;
+ }
+ VM_BUG_ON(batch->nr > batch->max);
+
+ return batch->max - batch->nr;
+}
+
+#endif /* HAVE_GENERIC_MMU_GATHER */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+ /* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ /*
+ * This isn't an RCU grace period and hence the page-tables cannot be
+ * assumed to be actually RCU-freed.
+ *
+ * It is however sufficient for software page-table walkers that rely on
+ * IRQ disabling. See the comment near struct mmu_table_batch.
+ */
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ __tlb_remove_table(table);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+ struct mmu_table_batch *batch;
+ int i;
+
+ batch = container_of(head, struct mmu_table_batch, rcu);
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch) {
+ call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
+ *batch = NULL;
+ }
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ tlb->need_flush = 1;
+
+ /*
+ * When there are fewer than two users of this mm there cannot be a
+ * concurrent page-table walk.
+ */
+ if (atomic_read(&tlb->mm->mm_users) < 2) {
+ __tlb_remove_table(table);
+ return;
+ }
+
+ if (*batch == NULL) {
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (*batch == NULL) {
+ tlb_remove_table_one(table);
+ return;
+ }
+ (*batch)->nr = 0;
+ }
+ (*batch)->tables[(*batch)->nr++] = table;
+ if ((*batch)->nr == MAX_TABLE_BATCH)
+ tlb_table_flush(tlb);
+}
+
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
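With the generic mmu_gather above, the gather structure lives on the caller's stack and the caller drives the flush explicitly. The calling convention, roughly as zap_page_range() below uses it (an illustrative sketch, not additional code from this patch):

	struct mmu_gather tlb;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not a full-mm teardown */
	update_hiwater_rss(mm);
	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
	tlb_finish_mmu(&tlb, start, end);	/* flush TLBs, free batched pages */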
/*
* If a p?d_bad entry is found while walking page tables, report
@@ -533,7 +731,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
add_taint(TAINT_BAD_PAGE);
}
-static inline int is_cow_mapping(unsigned int flags)
+static inline int is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
@@ -909,26 +1107,26 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
- pte_t *pte;
- spinlock_t *ptl;
+ int force_flush = 0;
int rss[NR_MM_COUNTERS];
+ spinlock_t *ptl;
+ pte_t *start_pte;
+ pte_t *pte;
+again:
init_rss_vec(rss);
-
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = start_pte;
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
if (pte_none(ptent)) {
- (*zap_work)--;
continue;
}
- (*zap_work) -= PAGE_SIZE;
-
if (pte_present(ptent)) {
struct page *page;
@@ -974,7 +1172,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
- tlb_remove_page(tlb, page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
continue;
}
/*
@@ -995,11 +1195,23 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
print_bad_pte(vma, addr, ptent, NULL);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+ } while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(start_pte, ptl);
+
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (addr != end)
+ goto again;
+ }
return addr;
}
@@ -1007,7 +1219,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -1019,19 +1231,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (next-addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
- } else if (zap_huge_pmd(tlb, vma, pmd)) {
- (*zap_work)--;
+ } else if (zap_huge_pmd(tlb, vma, pmd))
continue;
- }
/* fall through */
}
- if (pmd_none_or_clear_bad(pmd)) {
- (*zap_work)--;
+ if (pmd_none_or_clear_bad(pmd))
continue;
- }
- next = zap_pte_range(tlb, vma, pmd, addr, next,
- zap_work, details);
- } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
return addr;
}
@@ -1039,7 +1247,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -1047,13 +1255,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud)) {
- (*zap_work)--;
+ if (pud_none_or_clear_bad(pud))
continue;
- }
- next = zap_pmd_range(tlb, vma, pud, addr, next,
- zap_work, details);
- } while (pud++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+ } while (pud++, addr = next, addr != end);
return addr;
}
@@ -1061,7 +1266,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
static unsigned long unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
@@ -1075,29 +1280,19 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd)) {
- (*zap_work)--;
+ if (pgd_none_or_clear_bad(pgd))
continue;
- }
- next = zap_pud_range(tlb, vma, pgd, addr, next,
- zap_work, details);
- } while (pgd++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pud_range(tlb, vma, pgd, addr, next, details);
+ } while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
mem_cgroup_uncharge_end();
return addr;
}
-#ifdef CONFIG_PREEMPT
-# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
-#else
-/* No preempt: go for improved straight-line efficiency */
-# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
-#endif
-
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
+ * @tlb: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
@@ -1108,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
*
* Unmap all pages in the vma list.
*
- * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
- *
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
@@ -1121,17 +1312,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- long zap_work = ZAP_BLOCK_SIZE;
- unsigned long tlb_start = 0; /* For tlb_finish_mmu */
- int tlb_start_valid = 0;
unsigned long start = start_addr;
- spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
- int fullmm = (*tlbp)->fullmm;
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1338,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
untrack_pfn_vma(vma, 0, 0);
while (start != end) {
- if (!tlb_start_valid) {
- tlb_start = start;
- tlb_start_valid = 1;
- }
-
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
* It is undesirable to test vma->vm_file as it
@@ -1169,39 +1350,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
*/
- if (vma->vm_file) {
+ if (vma->vm_file)
unmap_hugepage_range(vma, start, end, NULL);
- zap_work -= (end - start) /
- pages_per_huge_page(hstate_vma(vma));
- }
start = end;
} else
- start = unmap_page_range(*tlbp, vma,
- start, end, &zap_work, details);
-
- if (zap_work > 0) {
- BUG_ON(start != end);
- break;
- }
-
- tlb_finish_mmu(*tlbp, tlb_start, start);
-
- if (need_resched() ||
- (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
- if (i_mmap_lock) {
- *tlbp = NULL;
- goto out;
- }
- cond_resched();
- }
-
- *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
- tlb_start_valid = 0;
- zap_work = ZAP_BLOCK_SIZE;
+ start = unmap_page_range(tlb, vma, start, end, details);
}
}
-out:
+
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
return start; /* which is now the end (or restart) address */
}
@@ -1217,16 +1374,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
- if (tlb)
- tlb_finish_mmu(tlb, address, end);
+ tlb_finish_mmu(&tlb, address, end);
return end;
}
@@ -1359,7 +1515,7 @@ split_fallthrough:
*/
mark_page_accessed(page);
}
- if (flags & FOLL_MLOCK) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -1410,6 +1566,61 @@ no_page_table:
return page;
}
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return stack_guard_page_start(vma, addr) ||
+ stack_guard_page_end(vma, addr+PAGE_SIZE);
+}
+
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk: task_struct of target task
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (e.g. if mmapped pagecache has been invalidated
+ * and subsequently refaulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags,
struct page **pages, struct vm_area_struct **vmas,
@@ -1437,9 +1648,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct *vma;
vma = find_extend_vma(mm, start);
- if (!vma && in_gate_area(tsk, start)) {
+ if (!vma && in_gate_area(mm, start)) {
unsigned long pg = start & PAGE_MASK;
- struct vm_area_struct *gate_vma = get_gate_vma(tsk);
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
@@ -1464,10 +1674,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pte_unmap(pte);
return i ? : -EFAULT;
}
+ vma = get_gate_vma(mm);
if (pages) {
struct page *page;
- page = vm_normal_page(gate_vma, start, *pte);
+ page = vm_normal_page(vma, start, *pte);
if (!page) {
if (!(gup_flags & FOLL_DUMP) &&
is_zero_pfn(pte_pfn(*pte)))
@@ -1481,12 +1692,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
get_page(page);
}
pte_unmap(pte);
- if (vmas)
- vmas[i] = gate_vma;
- i++;
- start += PAGE_SIZE;
- nr_pages--;
- continue;
+ goto next_page;
}
if (!vma ||
@@ -1516,10 +1722,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
int ret;
unsigned int fault_flags = 0;
+ /* For mlock, just skip the stack guard page. */
+ if (foll_flags & FOLL_MLOCK) {
+ if (stack_guard_page(vma, start))
+ goto next_page;
+ }
if (foll_flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (nonblocking)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (foll_flags & FOLL_NOWAIT)
+ fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
ret = handle_mm_fault(mm, vma, start,
fault_flags);
@@ -1527,19 +1740,30 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
- if (ret &
- (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
- VM_FAULT_SIGBUS))
+ if (ret & (VM_FAULT_HWPOISON |
+ VM_FAULT_HWPOISON_LARGE)) {
+ if (i)
+ return i;
+ else if (gup_flags & FOLL_HWPOISON)
+ return -EHWPOISON;
+ else
+ return -EFAULT;
+ }
+ if (ret & VM_FAULT_SIGBUS)
return i ? i : -EFAULT;
BUG();
}
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
+
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
if (ret & VM_FAULT_RETRY) {
- *nonblocking = 0;
+ if (nonblocking)
+ *nonblocking = 0;
return i;
}
@@ -1569,6 +1793,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
flush_anon_page(vma, page, start);
flush_dcache_page(page);
}
+next_page:
if (vmas)
vmas[i] = vma;
i++;
@@ -1578,10 +1803,68 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
} while (nr_pages);
return i;
}
+EXPORT_SYMBOL(__get_user_pages);
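The kernel-doc added above spells out the pinning contract: every returned page holds a reference that must be dropped with put_page(), and pages that were written to must be dirtied first. A short illustrative caller (array size and variable names are assumptions, not from this patch):

	struct page *pages[16];
	int i, got;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 16,
			     1 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	/* ... access the pinned pages ... */

	for (i = 0; i < got; i++) {
		set_page_dirty_lock(pages[i]);	/* they were written to */
		put_page(pages[i]);		/* drop the pin */
	}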
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags: flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where, for locking
+ * reasons, we try to access user memory in atomic context (within a
+ * pagefault_disable() section), that access fails with -EFAULT, and we want
+ * to resolve the user fault before trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * get_user_pages() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mmap_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return -EHWPOISON;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+ return 0;
+}
+
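To make the futex-style usage described above concrete: the caller attempts the access with page faults disabled, and on failure resolves the fault with fixup_user_fault() before retrying. A hedged sketch, with the surrounding variables assumed rather than taken from this patch:

	u32 val;
	int ret;

	for (;;) {
		pagefault_disable();
		ret = __copy_from_user_inatomic(&val, uaddr, sizeof(val));
		pagefault_enable();
		if (!ret)
			break;			/* got the value */

		/* resolve the fault outside the atomic section, then retry */
		down_read(&mm->mmap_sem);
		ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 0);
		up_read(&mm->mmap_sem);
		if (ret)
			return ret;		/* could not fix it up: give up */
	}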
+/*
* get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -2115,10 +2398,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
* handle_pte_fault chooses page fault handler according to an entry
* which was read non-atomically. Before making any commitment, on
* those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
* must check under lock before unmapping the pte and proceeding
* (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
+ * and do_anonymous_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
@@ -2314,7 +2597,7 @@ reuse:
* bit after it clear all dirty ptes, but before a racing
* do_wp_page installs a dirty pte.
*
- * do_no_page is protected similarly.
+ * __do_fault is protected similarly.
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
@@ -2464,96 +2747,11 @@ unwritable_page:
return ret;
}
-/*
- * Helper functions for unmap_mapping_range().
- *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
- *
- * We have to restart searching the prio_tree whenever we drop the lock,
- * since the iterator is only valid while the lock is held, and anyway
- * a later vma might be split and reinserted earlier while lock dropped.
- *
- * The list of nonlinear vmas could be handled more efficiently, using
- * a placeholder, but handle it in the same way until a need is shown.
- * It is important to search the prio_tree before nonlinear list: a vma
- * may become nonlinear and be shifted from prio_tree to nonlinear list
- * while the lock is dropped; but never shifted from list to prio_tree.
- *
- * In order to make forward progress despite restarting the search,
- * vm_truncate_count is used to mark a vma as now dealt with, so we can
- * quickly skip it next time around. Since the prio_tree search only
- * shows us those vmas affected by unmapping the range in question, we
- * can't efficiently keep all vmas in step with mapping->truncate_count:
- * so instead reset them all whenever it wraps back to 0 (then go to 1).
- * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
- *
- * In order to make forward progress despite repeatedly restarting some
- * large vma, note the restart_addr from unmap_vmas when it breaks out:
- * and restart from that address when we reach that vma again. It might
- * have been split or merged, shrunk or extended, but never shifted: so
- * restart_addr remains valid so long as it remains in the vma's range.
- * unmap_mapping_range forces truncate_count to leap over page-aligned
- * values so we can save vma's restart_addr in its truncate_count field.
- */
-#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
-
-static void reset_vma_truncate_counts(struct address_space *mapping)
-{
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
- vma->vm_truncate_count = 0;
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
- vma->vm_truncate_count = 0;
-}
-
-static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
- unsigned long restart_addr;
- int need_break;
-
- /*
- * files that support invalidating or truncating portions of the
- * file from under mmaped areas must have their ->fault function
- * return a locked page (and set VM_FAULT_LOCKED in the return).
- * This provides synchronisation against concurrent unmapping here.
- */
-
-again:
- restart_addr = vma->vm_truncate_count;
- if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
- start_addr = restart_addr;
- if (start_addr >= end_addr) {
- /* Top of vma has been split off since last time */
- vma->vm_truncate_count = details->truncate_count;
- return 0;
- }
- }
-
- restart_addr = zap_page_range(vma, start_addr,
- end_addr - start_addr, details);
- need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
-
- if (restart_addr >= end_addr) {
- /* We have now completed this vma: mark it so */
- vma->vm_truncate_count = details->truncate_count;
- if (!need_break)
- return 0;
- } else {
- /* Note restart_addr in vma's truncate_count field */
- vma->vm_truncate_count = restart_addr;
- if (!need_break)
- goto again;
- }
-
- spin_unlock(details->i_mmap_lock);
- cond_resched();
- spin_lock(details->i_mmap_lock);
- return -EINTR;
+ zap_page_range(vma, start_addr, end_addr - start_addr, details);
}
static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2563,12 +2761,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
struct prio_tree_iter iter;
pgoff_t vba, vea, zba, zea;
-restart:
vma_prio_tree_foreach(vma, &iter, root,
details->first_index, details->last_index) {
- /* Skip quickly over those we have already dealt with */
- if (vma->vm_truncate_count == details->truncate_count)
- continue;
vba = vma->vm_pgoff;
vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2580,11 +2774,10 @@ restart:
if (zea > vea)
zea = vea;
- if (unmap_mapping_range_vma(vma,
+ unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
- details) < 0)
- goto restart;
+ details);
}
}
@@ -2599,15 +2792,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
-restart:
list_for_each_entry(vma, head, shared.vm_set.list) {
- /* Skip quickly over those we have already dealt with */
- if (vma->vm_truncate_count == details->truncate_count)
- continue;
details->nonlinear_vma = vma;
- if (unmap_mapping_range_vma(vma, vma->vm_start,
- vma->vm_end, details) < 0)
- goto restart;
+ unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
}
}
@@ -2646,53 +2833,17 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
- details.i_mmap_lock = &mapping->i_mmap_lock;
- mutex_lock(&mapping->unmap_mutex);
- spin_lock(&mapping->i_mmap_lock);
-
- /* Protect against endless unmapping loops */
- mapping->truncate_count++;
- if (unlikely(is_restart_addr(mapping->truncate_count))) {
- if (mapping->truncate_count == 0)
- reset_vma_truncate_counts(mapping);
- mapping->truncate_count++;
- }
- details.truncate_count = mapping->truncate_count;
+ mutex_lock(&mapping->i_mmap_mutex);
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- spin_unlock(&mapping->i_mmap_lock);
- mutex_unlock(&mapping->unmap_mutex);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
EXPORT_SYMBOL(unmap_mapping_range);
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * If the underlying filesystem is not going to provide
- * a way to truncate a range of blocks (punch a hole) -
- * we should return failure right now.
- */
- if (!inode->i_op->truncate_range)
- return -ENOSYS;
-
- mutex_lock(&inode->i_mutex);
- down_write(&inode->i_alloc_sem);
- unmap_mapping_range(mapping, offset, (end - offset), 1);
- truncate_inode_pages_range(mapping, offset, end);
- unmap_mapping_range(mapping, offset, (end - offset), 1);
- inode->i_op->truncate_range(inode, offset, end);
- up_write(&inode->i_alloc_sem);
- mutex_unlock(&inode->i_mutex);
-
- return 0;
-}
-
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -2707,7 +2858,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t pte;
int locked;
- struct mem_cgroup *ptr = NULL;
+ struct mem_cgroup *ptr;
int exclusive = 0;
int ret = 0;
@@ -2747,6 +2898,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
@@ -2895,7 +3047,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (prev && prev->vm_end == address)
return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
- expand_stack(vma, address - PAGE_SIZE);
+ expand_downwards(vma, address - PAGE_SIZE);
}
if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
struct vm_area_struct *next = vma->vm_next;
@@ -2997,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table;
spinlock_t *ptl;
struct page *page;
+ struct page *cow_page;
pte_t entry;
int anon = 0;
- int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
int page_mkwrite = 0;
+ /*
+	 * If we do COW later, allocate the page before taking lock_page()
+ * on the file cache page. This will reduce lock holding time.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
+ cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!cow_page)
+ return VM_FAULT_OOM;
+
+ if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+ page_cache_release(cow_page);
+ return VM_FAULT_OOM;
+ }
+ } else
+ cow_page = NULL;
+
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
@@ -3013,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY)))
- return ret;
+ goto uncharge_out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
- return VM_FAULT_HWPOISON;
+ ret = VM_FAULT_HWPOISON;
+ goto uncharge_out;
}
/*
@@ -3036,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
+ page = cow_page;
anon = 1;
- if (unlikely(anon_vma_prepare(vma))) {
- ret = VM_FAULT_OOM;
- goto out;
- }
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
- vma, address);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
- }
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
- ret = VM_FAULT_OOM;
- page_cache_release(page);
- goto out;
- }
- charged = 1;
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
@@ -3121,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, page_table);
} else {
- if (charged)
- mem_cgroup_uncharge_page(page);
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
if (anon)
page_cache_release(page);
else
@@ -3131,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(page_table, ptl);
-out:
if (dirty_page) {
struct address_space *mapping = page->mapping;
@@ -3161,6 +3318,13 @@ out:
unwritable_page:
page_cache_release(page);
return ret;
+uncharge_out:
+	/* the fs's fault handler returned an error */
+ if (cow_page) {
+ mem_cgroup_uncharge_page(cow_page);
+ page_cache_release(cow_page);
+ }
+ return ret;
}
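The hunk above moves the COW page allocation (and its memcg charge) in front of the ->fault() call, so the allocation no longer happens while the file page is locked. A minimal user-space sketch of the same idea, deliberately generic (the names are made up and this is not kernel code): do the potentially slow allocation before entering the critical section and only publish the result under the lock.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static char *cached_copy;	/* protected by cache_lock */

/* Copy 'src' into the cache; allocate *before* taking the lock so the
 * critical section only covers the copy and the pointer update. */
static int cache_store(const char *src, size_t len)
{
	char *page = malloc(len);	/* may be slow; done unlocked */
	if (!page)
		return -1;

	pthread_mutex_lock(&cache_lock);
	memcpy(page, src, len);
	free(cached_copy);
	cached_copy = page;
	pthread_mutex_unlock(&cache_lock);
	return 0;
}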
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -3286,6 +3450,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
+ mem_cgroup_count_vm_event(mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
@@ -3322,7 +3487,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -3439,7 +3604,7 @@ static int __init gate_vma_init(void)
__initcall(gate_vma_init);
#endif
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef AT_SYSINFO_EHDR
return &gate_vma;
@@ -3448,7 +3613,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
#endif
}
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -3589,20 +3754,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
#endif
/*
- * Access another process' address space.
- * Source/target buffer must be kernel space,
- * Do not walk the page table directly, use get_user_pages
+ * Access another process' address space as given in mm. If non-NULL, use the
+ * given task for page fault accounting.
*/
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, int write)
{
- struct mm_struct *mm;
struct vm_area_struct *vma;
void *old_buf = buf;
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
-
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
@@ -3619,7 +3779,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
*/
#ifdef CONFIG_HAVE_IOREMAP_PROT
vma = find_vma(mm, addr);
- if (!vma)
+ if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
@@ -3651,11 +3811,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
addr += bytes;
}
up_read(&mm->mmap_sem);
- mmput(mm);
return buf - old_buf;
}
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @write: whether the access is a write
+ *
+ * The caller must hold a reference on @mm.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, int write)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, write);
+}
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
+ mmput(mm);
+
+ return ret;
+}
+
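With the split above, a caller that already holds a reference on the target mm (for example one obtained earlier with get_task_mm()) can use access_remote_vm() directly instead of access_process_vm(). A minimal sketch of such a caller; peek_remote() and its fixed 16-byte read are illustrative only, not part of the patch.

#include <linux/errno.h>
#include <linux/mm.h>

/* Hypothetical helper: read up to 16 bytes from a remote address space.
 * The caller must hold a reference on @mm, as the kerneldoc above requires. */
static int peek_remote(struct mm_struct *mm, unsigned long addr, char *out)
{
	int copied;

	copied = access_remote_vm(mm, addr, out, 16, 0);	/* 0 == read */
	return copied > 0 ? copied : -EFAULT;
}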
/*
* Print the name of a VMA.
*/
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 321fc7455df..6e7d8b21dbf 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,17 @@
#include "internal.h"
+/*
+ * online_page_callback contains a pointer to the current page-onlining
+ * function. Initially it is generic_online_page(). If needed, it can be
+ * changed by calling set_online_page_callback() to register a callback and
+ * restore_online_page_callback() to restore the generic callback.
+ */
+
+static void generic_online_page(struct page *page);
+
+static online_page_callback_t online_page_callback = generic_online_page;
+
DEFINE_MUTEX(mem_hotplug_mutex);
void lock_memory_hotplug(void)
@@ -361,27 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
EXPORT_SYMBOL_GPL(__remove_pages);
-void online_page(struct page *page)
+int set_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ lock_memory_hotplug();
+
+ if (online_page_callback == generic_online_page) {
+ online_page_callback = callback;
+ rc = 0;
+ }
+
+ unlock_memory_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(set_online_page_callback);
+
+int restore_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ lock_memory_hotplug();
+
+ if (online_page_callback == callback) {
+ online_page_callback = generic_online_page;
+ rc = 0;
+ }
+
+ unlock_memory_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(restore_online_page_callback);
+
+void __online_page_set_limits(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
- totalram_pages++;
if (pfn >= num_physpages)
num_physpages = pfn + 1;
+}
+EXPORT_SYMBOL_GPL(__online_page_set_limits);
+
+void __online_page_increment_counters(struct page *page)
+{
+ totalram_pages++;
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages++;
#endif
+}
+EXPORT_SYMBOL_GPL(__online_page_increment_counters);
-#ifdef CONFIG_FLATMEM
- max_mapnr = max(page_to_pfn(page), max_mapnr);
-#endif
-
+void __online_page_free(struct page *page)
+{
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
}
+EXPORT_SYMBOL_GPL(__online_page_free);
+
+static void generic_online_page(struct page *page)
+{
+ __online_page_set_limits(page);
+ __online_page_increment_counters(page);
+ __online_page_free(page);
+}
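Together with set_online_page_callback()/restore_online_page_callback() above, this lets a driver (a ballooning driver, for instance) claim newly onlined pages instead of handing them to the buddy allocator. A hedged sketch of such a registration, assuming the accompanying header change exports online_page_callback_t and the helpers; my_online_page() and the module hooks are hypothetical.

#include <linux/init.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>

/* Keep onlined pages for the driver: only update the pfn limits and
 * let the driver decide when (or whether) to free them. */
static void my_online_page(struct page *page)
{
	__online_page_set_limits(page);
	/* queue 'page' on a driver-private list instead of freeing it */
}

static int __init my_driver_init(void)
{
	return set_online_page_callback(&my_online_page);
}

static void __exit my_driver_exit(void)
{
	restore_online_page_callback(&my_online_page);
}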
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
@@ -392,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
- online_page(page);
+ (*online_page_callback)(page);
onlined_pages++;
}
*(unsigned long *)arg = onlined_pages;
@@ -400,7 +458,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
}
-int online_pages(unsigned long pfn, unsigned long nr_pages)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
{
unsigned long onlined_pages = 0;
struct zone *zone;
@@ -459,8 +517,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
zone_pcp_update(zone);
mutex_unlock(&zonelists_mutex);
- setup_per_zone_wmarks();
- calculate_zone_inactive_ratio(zone);
+
+ init_per_zone_wmark_min();
+
if (onlined_pages) {
kswapd_run(zone_to_nid(zone));
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -497,6 +556,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+ /*
+	 * The node we allocated has no zone fallback lists. To avoid
+	 * accessing an uninitialized zonelist, build it here.
+ */
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL);
+ mutex_unlock(&zonelists_mutex);
+
return pgdat;
}
@@ -518,7 +585,7 @@ int mem_online_node(int nid)
lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
- if (pgdat) {
+ if (!pgdat) {
ret = -ENOMEM;
goto out;
}
@@ -705,7 +772,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
- if (!page_count(page))
+ if (!get_page_unless_zero(page))
continue;
/*
* We can skip free pages. And we can only deal with pages on
@@ -713,6 +780,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
*/
ret = isolate_lru_page(page);
if (!ret) { /* Success */
+ put_page(page);
list_add_tail(&page->lru, &source);
move_pages--;
inc_zone_page_state(page, NR_ISOLATED_ANON +
@@ -724,7 +792,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
pfn);
dump_page(page);
#endif
- /* Becasue we don't have big zone->lock. we should
+ put_page(page);
+			/* Because we don't have a big zone->lock, we should
check this again here. */
if (page_count(page)) {
not_managed++;
@@ -795,7 +864,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-static int offline_pages(unsigned long start_pfn,
+static int __ref offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
unsigned long pfn, nr_pages, expire;
@@ -893,8 +962,8 @@ repeat:
zone->zone_pgdat->node_present_pages -= offlined_pages;
totalram_pages -= offlined_pages;
- setup_per_zone_wmarks();
- calculate_zone_inactive_ratio(zone);
+ init_per_zone_wmark_min();
+
if (!node_present_pages(node)) {
node_clear_state(node, N_HIGH_MEMORY);
kswapd_stop(node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 49355a970be..8b57173c1dd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,13 +93,13 @@
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
+#include <linux/random.h>
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
-#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
@@ -457,7 +457,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
},
};
-static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
@@ -492,9 +491,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
continue;
- if (flags & MPOL_MF_STATS)
- gather_stats(page, private, pte_dirty(*pte));
- else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, private, flags);
else
break;
@@ -993,7 +990,7 @@ int do_migrate_pages(struct mm_struct *mm,
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
+ * Otherwise when we finish scanning from_tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
@@ -1489,7 +1486,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-static struct mempolicy *get_vma_policy(struct task_struct *task,
+struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
@@ -1524,10 +1521,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
}
/* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+ int nd)
{
- int nd = numa_node_id();
-
switch (policy->mode) {
case MPOL_PREFERRED:
if (!(policy->flags & MPOL_F_LOCAL))
@@ -1650,6 +1646,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
return interleave_nodes(pol);
}
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns -1 if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+ int w, bit = -1;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_ord_to_pos(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
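A short illustrative use of node_random(): pick one of the allowed nodes at random, falling back to the local node when the mask is empty. pick_node() is a made-up helper, not part of the patch.

#include <linux/nodemask.h>
#include <linux/topology.h>

static int pick_node(const nodemask_t *allowed)
{
	int nid = node_random(allowed);		/* -1 if the mask is empty */

	return nid >= 0 ? nid : numa_node_id();
}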
+
#ifdef CONFIG_HUGETLBFS
/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
@@ -1679,7 +1690,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
zl = node_zonelist(interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma))), gfp_flags);
} else {
- zl = policy_zonelist(gfp_flags, *mpol);
+ zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
if ((*mpol)->mode == MPOL_BIND)
*nodemask = &(*mpol)->v.nodes;
}
@@ -1820,7 +1831,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr)
+ unsigned long addr, int node)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
@@ -1836,7 +1847,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
put_mems_allowed();
return page;
}
- zl = policy_zonelist(gfp, pol);
+ zl = policy_zonelist(gfp, pol, node);
if (unlikely(mpol_needs_cond_ref(pol))) {
/*
* slow path: ref counted shared policy
@@ -1892,7 +1903,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ policy_zonelist(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
}
@@ -1979,8 +1991,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
return nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- return a->v.preferred_node == b->v.preferred_node &&
- a->flags == b->flags;
+ return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
return 0;
@@ -2530,159 +2541,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
}
return p - buffer;
}
-
-struct numa_maps {
- unsigned long pages;
- unsigned long anon;
- unsigned long active;
- unsigned long writeback;
- unsigned long mapcount_max;
- unsigned long dirty;
- unsigned long swapcache;
- unsigned long node[MAX_NUMNODES];
-};
-
-static void gather_stats(struct page *page, void *private, int pte_dirty)
-{
- struct numa_maps *md = private;
- int count = page_mapcount(page);
-
- md->pages++;
- if (pte_dirty || PageDirty(page))
- md->dirty++;
-
- if (PageSwapCache(page))
- md->swapcache++;
-
- if (PageActive(page) || PageUnevictable(page))
- md->active++;
-
- if (PageWriteback(page))
- md->writeback++;
-
- if (PageAnon(page))
- md->anon++;
-
- if (count > md->mapcount_max)
- md->mapcount_max = count;
-
- md->node[page_to_nid(page)]++;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-static void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
-{
- unsigned long addr;
- struct page *page;
- struct hstate *h = hstate_vma(vma);
- unsigned long sz = huge_page_size(h);
-
- for (addr = start; addr < end; addr += sz) {
- pte_t *ptep = huge_pte_offset(vma->vm_mm,
- addr & huge_page_mask(h));
- pte_t pte;
-
- if (!ptep)
- continue;
-
- pte = *ptep;
- if (pte_none(pte))
- continue;
-
- page = pte_page(pte);
- if (!page)
- continue;
-
- gather_stats(page, md, pte_dirty(*ptep));
- }
-}
-#else
-static inline void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
-{
-}
-#endif
-
-/*
- * Display pages allocated per node and memory policy via /proc.
- */
-int show_numa_map(struct seq_file *m, void *v)
-{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct numa_maps *md;
- struct file *file = vma->vm_file;
- struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
- int n;
- char buffer[50];
-
- if (!mm)
- return 0;
-
- md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
- if (!md)
- return 0;
-
- pol = get_vma_policy(priv->task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol, 0);
- mpol_cond_put(pol);
-
- seq_printf(m, "%08lx %s", vma->vm_start, buffer);
-
- if (file) {
- seq_printf(m, " file=");
- seq_path(m, &file->f_path, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
- seq_printf(m, " heap");
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- seq_printf(m, " stack");
- }
-
- if (is_vm_hugetlb_page(vma)) {
- check_huge_range(vma, vma->vm_start, vma->vm_end, md);
- seq_printf(m, " huge");
- } else {
- check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
- }
-
- if (!md->pages)
- goto out;
-
- if (md->anon)
- seq_printf(m," anon=%lu",md->anon);
-
- if (md->dirty)
- seq_printf(m," dirty=%lu",md->dirty);
-
- if (md->pages != md->anon && md->pages != md->dirty)
- seq_printf(m, " mapped=%lu", md->pages);
-
- if (md->mapcount_max > 1)
- seq_printf(m, " mapmax=%lu", md->mapcount_max);
-
- if (md->swapcache)
- seq_printf(m," swapcache=%lu", md->swapcache);
-
- if (md->active < md->pages && !is_vm_hugetlb_page(vma))
- seq_printf(m," active=%lu", md->active);
-
- if (md->writeback)
- seq_printf(m," writeback=%lu", md->writeback);
-
- for_each_node_state(n, N_HIGH_MEMORY)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
-out:
- seq_putc(m, '\n');
- kfree(md);
-
- if (m->count < m->size)
- m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
- return 0;
-}
diff --git a/mm/migrate.c b/mm/migrate.c
index 352de555626..666e4e67741 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
*/
__dec_zone_page_state(page, NR_FILE_PAGES);
__inc_zone_page_state(newpage, NR_FILE_PAGES);
- if (PageSwapBacked(page)) {
+ if (!PageSwapCache(page) && PageSwapBacked(page)) {
__dec_zone_page_state(page, NR_SHMEM);
__inc_zone_page_state(newpage, NR_SHMEM);
}
@@ -375,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* redo the accounting that clear_page_dirty_for_io undid,
* but we can't use set_page_dirty because that function
* is actually a signal that all of the page has become dirty.
- * Wheras only part of our page may be dirty.
+ * Whereas only part of our page may be dirty.
*/
__set_page_dirty_nobuffers(newpage);
}
@@ -564,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* == 0 - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache)
+ int remap_swapcache, bool sync)
{
struct address_space *mapping;
int rc;
@@ -586,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
mapping = page_mapping(page);
if (!mapping)
rc = migrate_page(mapping, newpage, page);
- else if (mapping->a_ops->migratepage)
+ else {
/*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
+		 * Do not write back pages if !sync and migratepage is
+		 * not pointing to migrate_page(), which is nonblocking
+ * (swapcache/tmpfs uses migratepage = migrate_page).
*/
- rc = mapping->a_ops->migratepage(mapping,
- newpage, page);
- else
- rc = fallback_migrate_page(mapping, newpage, page);
+ if (PageDirty(page) && !sync &&
+ mapping->a_ops->migratepage != migrate_page)
+ rc = -EBUSY;
+ else if (mapping->a_ops->migratepage)
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(mapping,
+ newpage, page);
+ else
+ rc = fallback_migrate_page(mapping, newpage, page);
+ }
if (rc) {
newpage->mapping = NULL;
@@ -623,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
struct page *newpage = get_new_page(page, private, &result);
int remap_swapcache = 1;
int charge = 0;
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
if (!newpage)
@@ -641,7 +651,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
rc = -EAGAIN;
if (!trylock_page(page)) {
- if (!force)
+ if (!force || !sync)
goto move_newpage;
/*
@@ -678,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
}
/* charge against new page */
- charge = mem_cgroup_prepare_migration(page, newpage, &mem);
+ charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
if (charge == -ENOMEM) {
rc = -ENOMEM;
goto unlock;
@@ -686,7 +696,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
BUG_ON(charge);
if (PageWriteback(page)) {
- if (!force || !sync)
+ /*
+ * For !sync, there is no point retrying as the retry loop
+ * is expected to be too short for PageWriteback to be cleared
+ */
+ if (!sync) {
+ rc = -EBUSY;
+ goto uncharge;
+ }
+ if (!force)
goto uncharge;
wait_on_page_writeback(page);
}
@@ -703,15 +721,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* Only page_lock_anon_vma() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_get_anon_vma(page);
if (anon_vma) {
/*
- * Take a reference count on the anon_vma if the
- * page is mapped so that it is guaranteed to
- * exist when the page is remapped later
+ * Anon page
*/
- get_anon_vma(anon_vma);
- page_unlock_anon_vma(anon_vma);
} else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
@@ -757,14 +771,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache);
+ rc = move_to_new_page(newpage, page, remap_swapcache, sync);
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
/* Drop an anon_vma reference if we took one */
if (anon_vma)
- drop_anon_vma(anon_vma);
+ put_anon_vma(anon_vma);
uncharge:
if (!charge)
@@ -839,24 +853,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
lock_page(hpage);
}
- if (PageAnon(hpage)) {
- anon_vma = page_lock_anon_vma(hpage);
- if (anon_vma) {
- get_anon_vma(anon_vma);
- page_unlock_anon_vma(anon_vma);
- }
- }
+ if (PageAnon(hpage))
+ anon_vma = page_get_anon_vma(hpage);
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1);
+ rc = move_to_new_page(new_hpage, hpage, 1, sync);
if (rc)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
- drop_anon_vma(anon_vma);
+ put_anon_vma(anon_vma);
out:
unlock_page(hpage);
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c7..636a86876ff 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* file will not get a swp_entry_t in its pte, but rather it is like
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
- *
- * However when tmpfs moves the page from pagecache and into swapcache,
- * it is still in core, but the find_get_page below won't find it.
- * No big deal, but make a note of it.
*/
page = find_get_page(mapping, pgoff);
+#ifdef CONFIG_SWAP
+ /* shmem/tmpfs may return swap: account for swapcache page too. */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swap = radix_to_swp_entry(page);
+ page = find_get_page(&swapper_space, swap.val);
+ }
+#endif
if (page) {
present = PageUptodate(page);
page_cache_release(page);
diff --git a/mm/mlock.c b/mm/mlock.c
index c3924c7f00b..048260c4e02 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page)
}
}
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
- return (vma->vm_flags & VM_GROWSDOWN) &&
- (vma->vm_start == addr) &&
- !vma_stack_continue(vma->vm_prev, addr);
-}
-
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
@@ -169,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- gup_flags = FOLL_TOUCH;
+ gup_flags = FOLL_TOUCH | FOLL_MLOCK;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -185,15 +178,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
gup_flags |= FOLL_FORCE;
- if (vma->vm_flags & VM_LOCKED)
- gup_flags |= FOLL_MLOCK;
-
- /* We don't try to access the guard page of a stack vma */
- if (stack_guard_page(vma, start)) {
- addr += PAGE_SIZE;
- nr_pages--;
- }
-
return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
NULL, NULL, nonblocking);
}
@@ -237,7 +221,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current))) {
+ vma == get_gate_vma(current->mm))) {
__mlock_vma_pages_range(vma, start, end, NULL);
@@ -323,16 +307,16 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* For vmas that pass the filters, merge/split as appropriate.
*/
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, unsigned int newflags)
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
struct mm_struct *mm = vma->vm_mm;
pgoff_t pgoff;
int nr_pages;
int ret = 0;
- int lock = newflags & VM_LOCKED;
+ int lock = !!(newflags & VM_LOCKED);
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
- is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
goto out; /* don't set VM_LOCKED, don't count */
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -401,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
prev = vma;
for (nstart = start ; ; ) {
- unsigned int newflags;
+ vm_flags_t newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
@@ -540,7 +524,7 @@ static int do_mlockall(int flags)
goto out;
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
- unsigned int newflags;
+ vm_flags_t newflags;
newflags = vma->vm_flags | VM_LOCKED;
if (!(flags & MCL_CURRENT))
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ec8eb5a9cd..a65efd4db3e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -84,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
}
EXPORT_SYMBOL(vm_get_page_prot);
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
+int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-struct percpu_counter vm_committed_as;
+/*
+ * Make sure vm_committed_as is in its own cacheline and not shared with
+ * other variables: it can be updated frequently by several CPUs.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
* Check that a process has enough memory to allocate a new virtual
@@ -118,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+		 * case: they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
- free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
@@ -132,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= free / 32;
if (free > pages)
return 0;
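Read together, the OVERCOMMIT_GUESS path above now estimates available memory roughly as follows; this is an informal restatement of the hunk, not additional code.

/*
 *   free  = NR_FREE_PAGES + NR_FILE_PAGES - NR_SHMEM
 *         + nr_swap_pages + NR_SLAB_RECLAIMABLE
 *         - totalreserve_pages
 *   if (!cap_sys_admin)
 *           free -= free / 32;           (keep ~3% for root)
 *   succeed iff free > pages, otherwise fall through to error
 */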
@@ -190,7 +186,7 @@ error:
}
/*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_mutex
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
@@ -218,9 +214,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
__remove_shared_vm_struct(vma, file, mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
}
@@ -259,7 +255,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
- if (mm->start_brk > PAGE_ALIGN(mm->end_data))
+ if (current->brk_randomized)
min_brk = mm->start_brk;
else
min_brk = mm->end_data;
@@ -394,29 +390,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
return vma;
}
-static inline void
-__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
-{
- struct vm_area_struct *next;
-
- vma->vm_prev = prev;
- if (prev) {
- next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- mm->mmap = vma;
- if (rb_parent)
- next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- else
- next = NULL;
- }
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
-}
-
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
@@ -464,16 +437,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_file)
mapping = vma->vm_file->f_mapping;
- if (mapping) {
- spin_lock(&mapping->i_mmap_lock);
- vma->vm_truncate_count = mapping->truncate_count;
- }
+ if (mapping)
+ mutex_lock(&mapping->i_mmap_mutex);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
mm->map_count++;
validate_mm(mm);
@@ -576,17 +547,8 @@ again: remove_next = 1 + (end > next->vm_end);
mapping = file->f_mapping;
if (!(vma->vm_flags & VM_NONLINEAR))
root = &mapping->i_mmap;
- spin_lock(&mapping->i_mmap_lock);
- if (importer &&
- vma->vm_truncate_count != next->vm_truncate_count) {
- /*
- * unmap_mapping_range might be in progress:
- * ensure that the expanding vma is rescanned.
- */
- importer->vm_truncate_count = 0;
- }
+ mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
- insert->vm_truncate_count = vma->vm_truncate_count;
/*
* Put into prio_tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
@@ -605,7 +567,7 @@ again: remove_next = 1 + (end > next->vm_end);
* lock may be shared between many sibling processes. Skipping
* the lock for brk adjustments makes a difference sometimes.
*/
- if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
+ if (vma->anon_vma && (importer || start != vma->vm_start)) {
anon_vma = vma->anon_vma;
anon_vma_lock(anon_vma);
}
@@ -652,7 +614,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (anon_vma)
anon_vma_unlock(anon_vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (remove_next) {
if (file) {
@@ -699,9 +661,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
}
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2)
+ struct anon_vma *anon_vma2,
+ struct vm_area_struct *vma)
{
- return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
+ /*
+	 * The list_is_singular() test is to avoid merging VMAs cloned from
+	 * parents. This improves scalability by reducing anon_vma lock contention.
+ */
+ if ((!anon_vma1 || !anon_vma2) && (!vma ||
+ list_is_singular(&vma->anon_vma_chain)))
+ return 1;
+ return anon_vma1 == anon_vma2;
}
/*
@@ -720,7 +690,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
if (is_mergeable_vma(vma, file, vm_flags) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
}
@@ -739,7 +709,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
if (is_mergeable_vma(vma, file, vm_flags) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -817,7 +787,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen) &&
is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma)) {
+ next->anon_vma, NULL)) {
/* cases 1, 6 */
err = vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL);
@@ -928,14 +898,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
if (anon_vma)
return anon_vma;
try_prev:
- /*
- * It is potentially slow to have to call find_vma_prev here.
- * But it's only on the first write fault on the vma, not
- * every time, and we could devise a way to avoid it later
- * (e.g. stash info in next's anon_vma_node when assigning
- * an anon_vma, or when trying vma_merge). Another time.
- */
- BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+ near = vma->vm_prev;
if (!near)
goto none;
@@ -982,7 +945,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
{
struct mm_struct * mm = current->mm;
struct inode *inode;
- unsigned int vm_flags;
+ vm_flags_t vm_flags;
int error;
unsigned long reqprot = prot;
@@ -1187,7 +1150,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
*/
int vma_wants_writenotify(struct vm_area_struct *vma)
{
- unsigned int vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
/* If it was private or non-writable, the write bit is already clear */
if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1215,7 +1178,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
* We account for memory if it's a private writeable mapping,
* not hugepages and VM_NORESERVE wasn't set.
*/
-static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
/*
* hugetlb has its own accounting separate from the core VM
@@ -1229,7 +1192,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
- unsigned int vm_flags, unsigned long pgoff)
+ vm_flags_t vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
@@ -1767,10 +1730,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- vma->vm_end = address;
- perf_event_mmap(vma);
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ vma->vm_end = address;
+ perf_event_mmap(vma);
+ }
}
}
vma_unlock_anon_vma(vma);
@@ -1782,7 +1748,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-static int expand_downwards(struct vm_area_struct *vma,
+int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
int error;
@@ -1814,11 +1780,14 @@ static int expand_downwards(struct vm_area_struct *vma,
size = vma->vm_end - address;
grow = (vma->vm_start - address) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
- perf_event_mmap(vma);
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ perf_event_mmap(vma);
+ }
}
}
vma_unlock_anon_vma(vma);
@@ -1826,11 +1795,6 @@ static int expand_downwards(struct vm_area_struct *vma,
return error;
}
-int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
-{
- return expand_downwards(vma, address);
-}
-
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
@@ -1913,17 +1877,17 @@ static void unmap_region(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
- next? next->vm_start: 0);
- tlb_finish_mmu(tlb, start, end);
+ free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : 0);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
@@ -2065,9 +2029,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return -EINVAL;
/* Find the first overlapping VMA */
- vma = find_vma_prev(mm, start, &prev);
+ vma = find_vma(mm, start);
if (!vma)
return 0;
+ prev = vma->vm_prev;
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
@@ -2265,7 +2230,7 @@ EXPORT_SYMBOL(do_brk);
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
unsigned long end;
@@ -2290,14 +2255,14 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb = tlb_gather_mmu(mm, 1);
+ tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
- tlb_finish_mmu(tlb, 0, end);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ tlb_finish_mmu(&tlb, 0, end);
/*
* Walk the list again, actually closing and freeing it,
@@ -2311,7 +2276,7 @@ void exit_mmap(struct mm_struct *mm)
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_mutex is taken here.
*/
int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
@@ -2523,15 +2488,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
+ mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
- * anon_vma->root->lock. If some other vma in this mm shares
+ * anon_vma->root->mutex. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->root->lock.
+ * anon_vma->root->mutex.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->head.next))
@@ -2553,7 +2518,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+ mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
}
}
@@ -2580,7 +2545,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
@@ -2636,7 +2601,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->root->lock.
+ * anon_vma->root->mutex.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->head.next))
@@ -2652,7 +2617,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
* AS_MM_ALL_LOCKS can't change to 0 from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
&mapping->flags))
BUG();
diff --git a/mm/mremap.c b/mm/mremap.c
index 1de98d492dd..506fa44403d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -93,8 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
- new_vma->vm_truncate_count = 0;
+ mutex_lock(&mapping->i_mmap_mutex);
}
/*
@@ -123,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}
@@ -277,9 +276,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (old_len > vma->vm_end - addr)
goto Efault;
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
- if (new_len > old_len)
+ /* Need to be careful about a growing mapping */
+ if (new_len > old_len) {
+ unsigned long pgoff;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
goto Efault;
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ goto Einval;
}
if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 00000000000..6e93dc7f258
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,404 @@
+/*
+ * bootmem - A boot-time physical memory allocator and configurator
+ *
+ * Copyright (C) 1999 Ingo Molnar
+ * 1999 Kanoj Sarcar, SGI
+ * 2008 Johannes Weiner
+ *
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
+ */
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
+
+#include <asm/bug.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+#include "internal.h"
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data;
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ void *ptr;
+ u64 addr;
+
+ if (limit > memblock.current_limit)
+ limit = memblock.current_limit;
+
+ addr = find_memory_core_early(nid, size, align, goal, limit);
+
+ if (addr == MEMBLOCK_ERROR)
+ return NULL;
+
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size);
+ memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+ return ptr;
+}
+
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator; no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+ unsigned long cursor, end;
+
+ kmemleak_free_part(__va(addr), size);
+
+ cursor = PFN_UP(addr);
+ end = PFN_DOWN(addr + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
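A hedged usage sketch for free_bootmem_late(): hand a range that was reserved early (here a made-up firmware buffer at fw_base/fw_size) back to the page allocator once the system no longer needs it.

#include <linux/bootmem.h>
#include <linux/init.h>

/* fw_base and fw_size are hypothetical; both describe an earlier
 * reservation that is no longer required after late init. */
static void __init release_fw_buffer(unsigned long fw_base,
				     unsigned long fw_size)
{
	free_bootmem_late(fw_base, fw_size);
}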
+
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+ int i;
+ unsigned long start_aligned, end_aligned;
+ int order = ilog2(BITS_PER_LONG);
+
+ start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+ end_aligned = end & ~(BITS_PER_LONG - 1);
+
+ if (end_aligned <= start_aligned) {
+ for (i = start; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ return;
+ }
+
+ for (i = start; i < start_aligned; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+ __free_pages_bootmem(pfn_to_page(i), order);
+
+ for (i = end_aligned; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+}
+
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+ int i;
+ u64 start, end;
+ unsigned long count = 0;
+ struct range *range = NULL;
+ int nr_range;
+
+ nr_range = get_free_all_memory_range(&range, nodeid);
+
+ for (i = 0; i < nr_range; i++) {
+ start = range[i].start;
+ end = range[i].end;
+ count += end - start;
+ __free_pages_memory(start, end);
+ }
+
+ return count;
+}
+
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+{
+ register_page_bootmem_info_node(pgdat);
+
+ /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+ return 0;
+}
+
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem(void)
+{
+ /*
+	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+	 * because in some cases (e.g. when node 0 has no RAM installed)
+	 * the low RAM will be on node 1.
+	 * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
+	 * are used, not only those related to node 0.
+ */
+ return free_all_memory_core_early(MAX_NUMNODES);
+}
+
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size)
+{
+ kmemleak_free_part(__va(physaddr), size);
+ memblock_x86_free_range(physaddr, physaddr + size);
+}
+
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long addr, unsigned long size)
+{
+ kmemleak_free_part(__va(addr), size);
+ memblock_x86_free_range(addr, addr + size);
+}
+
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+ if (ptr)
+ return ptr;
+
+ if (goal != 0) {
+ goal = 0;
+ goto restart;
+ }
+
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = -1UL;
+
+ return ___alloc_bootmem_nopanic(size, align, goal, limit);
+}
+
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+
+ if (mem)
+ return mem;
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = -1UL;
+
+ return ___alloc_bootmem(size, align, goal, limit);
+}
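For completeness, a minimal sketch of how an early-boot caller might use this allocator family; setup_early_table() and its PAGE_SIZE-sized request are illustrative, and real callers pick size, alignment and goal to suit their hardware.

#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>

static void * __init setup_early_table(void)
{
	/* panics on failure; use __alloc_bootmem_nopanic() instead if the
	 * caller can recover from a failed allocation */
	return __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0);
}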
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+ if (ptr)
+ return ptr;
+
+ return __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, -1ULL);
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ return __alloc_bootmem_node(pgdat, size, align, goal);
+}
+
+#ifdef CONFIG_SPARSEMEM
+/**
+ * alloc_bootmem_section - allocate boot memory from a specific section
+ * @size: size of the request in bytes
+ * @section_nr: sparse map section to allocate from
+ *
+ * Return NULL on failure.
+ */
+void * __init alloc_bootmem_section(unsigned long size,
+ unsigned long section_nr)
+{
+ unsigned long pfn, goal, limit;
+
+ pfn = section_nr_to_pfn(section_nr);
+ goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+ return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+ SMP_CACHE_BYTES, goal, limit);
+}
+#endif
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+ if (ptr)
+ return ptr;
+
+ return __alloc_bootmem_nopanic(size, align, goal);
+}
+
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
+#endif
+
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+}
+
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+ if (ptr)
+ return ptr;
+
+ return __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+}
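
The node-aware nobootmem helpers above share one fallback shape: once slab is available they defer to kzalloc_node(), otherwise they try the requested node first and retry with MAX_NUMNODES so the allocation may land on any node, with the low variants additionally capped at ARCH_LOW_ADDRESS_LIMIT. Below is a minimal userspace sketch of that try-preferred-then-any shape; alloc_on_node() and the node IDs are stand-ins invented for the example, not kernel API.

/* Illustrative only: mimics the "try preferred node, then any node,
 * within an address limit" fallback used by the nobootmem helpers.
 * alloc_on_node() is a stand-in, not a kernel function. */
#include <stdio.h>
#include <stdlib.h>

#define ANY_NODE  -1            /* plays the role of MAX_NUMNODES */
#define LOW_LIMIT 0xffffffffUL  /* plays the role of ARCH_LOW_ADDRESS_LIMIT */

static void *alloc_on_node(int nid, size_t size, unsigned long limit)
{
	/* Pretend node 0 is exhausted so the fallback path is exercised. */
	if (nid == 0)
		return NULL;
	(void)limit;	/* a real allocator would refuse addresses above limit */
	return malloc(size);
}

static void *alloc_low_node(int nid, size_t size)
{
	void *ptr = alloc_on_node(nid, size, LOW_LIMIT);

	if (ptr)
		return ptr;
	/* same order as __alloc_bootmem_low_node(): retry on any node */
	return alloc_on_node(ANY_NODE, size, LOW_LIMIT);
}

int main(void)
{
	void *p = alloc_low_node(0, 64);

	printf("allocation %s\n", p ? "succeeded via fallback" : "failed");
	free(p);
	return 0;
}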
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e1424d3d..4358032566e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/tracehook.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
@@ -680,9 +679,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma, **pp, *next;
+ struct vm_area_struct *pvma, *prev;
struct address_space *mapping;
- struct rb_node **p, *parent;
+ struct rb_node **p, *parent, *rb_prev;
kenter(",%p", vma);
@@ -703,7 +702,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
}
/* add the VMA to the tree */
- parent = NULL;
+ parent = rb_prev = NULL;
p = &mm->mm_rb.rb_node;
while (*p) {
parent = *p;
@@ -713,17 +712,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
* (the latter is necessary as we may get identical VMAs) */
if (vma->vm_start < pvma->vm_start)
p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start)
+ else if (vma->vm_start > pvma->vm_start) {
+ rb_prev = parent;
p = &(*p)->rb_right;
- else if (vma->vm_end < pvma->vm_end)
+ } else if (vma->vm_end < pvma->vm_end)
p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end)
+ else if (vma->vm_end > pvma->vm_end) {
+ rb_prev = parent;
p = &(*p)->rb_right;
- else if (vma < pvma)
+ } else if (vma < pvma)
p = &(*p)->rb_left;
- else if (vma > pvma)
+ else if (vma > pvma) {
+ rb_prev = parent;
p = &(*p)->rb_right;
- else
+ } else
BUG();
}
@@ -731,20 +733,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
/* add VMA to the VMA list also */
- for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
- if (pvma->vm_start > vma->vm_start)
- break;
- if (pvma->vm_start < vma->vm_start)
- continue;
- if (pvma->vm_end < vma->vm_end)
- break;
- }
+ prev = NULL;
+ if (rb_prev)
+ prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- next = *pp;
- *pp = vma;
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
+ __vma_link_list(mm, vma, prev, parent);
}
/*
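
The add_vma_to_mm() hunk above replaces the linear walk of mm->mmap with bookkeeping done during the rbtree descent itself: every time the insertion path steps right, that parent becomes the best-so-far predecessor (rb_prev), so __vma_link_list() can splice the VMA into the sorted list directly. A standalone sketch of the same idea on a plain, unbalanced binary search tree keyed by a start address (deliberately not the kernel's rbtree; keys assumed unique):

/* Illustrative sketch: while descending to the insertion point, the last
 * node where we step right is the in-order predecessor of the new key.
 * Plain unbalanced BST with unique keys; not the kernel rbtree code. */
#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned long start;
	struct node *left, *right;
};

static struct node *insert(struct node *root, unsigned long start,
			   struct node **prev)
{
	struct node **p = &root;

	*prev = NULL;
	while (*p) {
		if (start < (*p)->start) {
			p = &(*p)->left;
		} else {
			*prev = *p;	/* stepped right: candidate predecessor */
			p = &(*p)->right;
		}
	}
	*p = calloc(1, sizeof(**p));
	(*p)->start = start;
	return root;
}

int main(void)
{
	struct node *root = NULL, *prev;
	unsigned long starts[] = { 0x1000, 0x8000, 0x4000, 0x6000 };

	for (int i = 0; i < 4; i++) {
		root = insert(root, starts[i], &prev);
		if (prev)
			printf("inserted %#lx, predecessor %#lx\n",
			       starts[i], prev->start);
		else
			printf("inserted %#lx, no predecessor\n", starts[i]);
	}
	return 0;
}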
@@ -752,7 +745,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
- struct vm_area_struct **pp;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
@@ -775,12 +767,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
/* remove from the MM's tree and list */
rb_erase(&vma->vm_rb, &mm->mm_rb);
- for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
- if (*pp == vma) {
- *pp = vma->vm_next;
- break;
- }
- }
+
+ if (vma->vm_prev)
+ vma->vm_prev->vm_next = vma->vm_next;
+ else
+ mm->mmap = vma->vm_next;
+
+ if (vma->vm_next)
+ vma->vm_next->vm_prev = vma->vm_prev;
vma->vm_mm = NULL;
}
@@ -809,17 +803,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
- struct rb_node *n = mm->mm_rb.rb_node;
/* check the cache first */
vma = mm->mmap_cache;
if (vma && vma->vm_start <= addr && vma->vm_end > addr)
return vma;
- /* trawl the tree (there may be multiple mappings in which addr
+ /* trawl the list (there may be multiple mappings in which addr
* resides) */
- for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
- vma = rb_entry(n, struct vm_area_struct, vm_rb);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
@@ -859,7 +851,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long len)
{
struct vm_area_struct *vma;
- struct rb_node *n = mm->mm_rb.rb_node;
unsigned long end = addr + len;
/* check the cache first */
@@ -867,10 +858,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma && vma->vm_start == addr && vma->vm_end == end)
return vma;
- /* trawl the tree (there may be multiple mappings in which addr
+ /* trawl the list (there may be multiple mappings in which addr
* resides) */
- for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
- vma = rb_entry(n, struct vm_area_struct, vm_rb);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->vm_start < addr)
continue;
if (vma->vm_start > addr)
@@ -1096,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
* it's being traced - otherwise breakpoints set in it may interfere
* with another untraced process
*/
- if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+ if ((flags & MAP_PRIVATE) && current->ptrace)
vm_flags &= ~VM_MAYSHARE;
return vm_flags;
@@ -1133,7 +1123,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
unsigned long capabilities)
{
struct page *pages;
- unsigned long total, point, n, rlen;
+ unsigned long total, point, n;
void *base;
int ret, order;
@@ -1157,13 +1147,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
* make a private copy of the data and map that instead */
}
- rlen = PAGE_ALIGN(len);
/* allocate some memory to hold the mapping
* - note that this may not return a page-aligned address if the object
* we're allocating is smaller than a page
*/
- order = get_order(rlen);
+ order = get_order(len);
kdebug("alloc order %d for %lx", order, len);
pages = alloc_pages(GFP_KERNEL, order);
@@ -1173,7 +1162,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
total = 1 << order;
atomic_long_add(total, &mmap_pages_allocated);
- point = rlen >> PAGE_SHIFT;
+ point = len >> PAGE_SHIFT;
/* we allocated a power-of-2 sized page set, so we may want to trim off
* the excess */
@@ -1195,7 +1184,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
base = page_address(pages);
region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
region->vm_start = (unsigned long) base;
- region->vm_end = region->vm_start + rlen;
+ region->vm_end = region->vm_start + len;
region->vm_top = region->vm_start + (total << PAGE_SHIFT);
vma->vm_start = region->vm_start;
@@ -1211,22 +1200,22 @@ static int do_mmap_private(struct vm_area_struct *vma,
old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
+ ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
set_fs(old_fs);
if (ret < 0)
goto error_free;
/* clear the last little bit */
- if (ret < rlen)
- memset(base + ret, 0, rlen - ret);
+ if (ret < len)
+ memset(base + ret, 0, len - ret);
}
return 0;
error_free:
- free_page_series(region->vm_start, region->vm_end);
+ free_page_series(region->vm_start, region->vm_top);
region->vm_start = vma->vm_start = 0;
region->vm_end = vma->vm_end = 0;
region->vm_top = 0;
@@ -1235,7 +1224,7 @@ error_free:
enomem:
printk("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
}
@@ -1268,6 +1257,7 @@ unsigned long do_mmap_pgoff(struct file *file,
/* we ignore the address hint */
addr = 0;
+ len = PAGE_ALIGN(len);
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
@@ -1385,15 +1375,15 @@ unsigned long do_mmap_pgoff(struct file *file,
if (capabilities & BDI_CAP_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
- if (IS_ERR((void *) addr)) {
+ if (IS_ERR_VALUE(addr)) {
ret = addr;
- if (ret != (unsigned long) -ENOSYS)
+ if (ret != -ENOSYS)
goto error_just_free;
/* the driver refused to tell us where to site
* the mapping so we'll have to attempt to copy
* it */
- ret = (unsigned long) -ENODEV;
+ ret = -ENODEV;
if (!(capabilities & BDI_CAP_MAP_COPY))
goto error_just_free;
@@ -1468,14 +1458,14 @@ error_getting_vma:
printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
" from process %d failed\n",
len, current->pid);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
error_getting_region:
printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
" from process %d failed\n",
len, current->pid);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
}
EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1644,15 +1634,17 @@ static int shrink_vma(struct mm_struct *mm,
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
struct vm_area_struct *vma;
- struct rb_node *rb;
- unsigned long end = start + len;
+ unsigned long end;
int ret;
kenter(",%lx,%zx", start, len);
+ len = PAGE_ALIGN(len);
if (len == 0)
return -EINVAL;
+ end = start + len;
+
/* find the first potentially overlapping VMA */
vma = find_vma(mm, start);
if (!vma) {
@@ -1677,9 +1669,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
}
if (end == vma->vm_end)
goto erase_whole_vma;
- rb = rb_next(&vma->vm_rb);
- vma = rb_entry(rb, struct vm_area_struct, vm_rb);
- } while (rb);
+ vma = vma->vm_next;
+ } while (vma);
kleave(" = -EINVAL [split file]");
return -EINVAL;
} else {
@@ -1773,6 +1764,8 @@ unsigned long do_mremap(unsigned long addr,
struct vm_area_struct *vma;
/* insanity checks first */
+ old_len = PAGE_ALIGN(old_len);
+ new_len = PAGE_ALIGN(new_len);
if (old_len == 0 || new_len == 0)
return (unsigned long) -EINVAL;
@@ -1819,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
return NULL;
}
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
- unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
- vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+ if (addr != (pfn << PAGE_SHIFT))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1842,10 +1838,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
@@ -1892,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
- free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
@@ -1906,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= free / 32;
if (free > pages)
return 0;
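
With OVERCOMMIT_GUESS the rewritten check above folds everything into one estimate: free pages plus file pages, minus shmem (which can only be swapped, not dropped), plus swap slots and reclaimable slab, minus the reserve, and finally minus roughly 3% when the caller lacks CAP_SYS_ADMIN. A standalone sketch of that arithmetic with invented page counts (none of these numbers come from the kernel):

/* Illustrative arithmetic for the OVERCOMMIT_GUESS estimate above.
 * All page counts are made up for the example. */
#include <stdio.h>
#include <stdbool.h>

static bool enough_memory(long pages, bool cap_sys_admin)
{
	long free_pages    = 20000;	/* NR_FREE_PAGES */
	long file_pages    = 50000;	/* NR_FILE_PAGES */
	long shmem_pages   = 8000;	/* NR_SHMEM: swap-backed, not freeable */
	long swap_pages    = 10000;	/* nr_swap_pages */
	long slab_reclaim  = 3000;	/* NR_SLAB_RECLAIMABLE */
	long reserve_pages = 4000;	/* totalreserve_pages */
	long free;

	free = free_pages + file_pages - shmem_pages
		+ swap_pages + slab_reclaim;

	if (free <= reserve_pages)
		return false;
	free -= reserve_pages;

	if (!cap_sys_admin)
		free -= free / 32;	/* leave the last ~3% for root */

	printf("estimated free: %ld pages, request: %ld pages\n", free, pages);
	return free > pages;
}

int main(void)
{
	printf("allowed: %s\n", enough_memory(60000, false) ? "yes" : "no");
	return 0;
}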
@@ -1963,7 +1947,7 @@ error:
return -ENOMEM;
}
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
return 0;
}
@@ -1975,21 +1959,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-/*
- * Access another process' address space.
- * - source/target buffer must be kernel space
- */
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
- struct mm_struct *mm;
-
- if (addr + len < addr)
- return 0;
-
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
down_read(&mm->mmap_sem);
@@ -2014,6 +1987,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
up_read(&mm->mmap_sem);
+
+ return len;
+}
+
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @write: whether the access is a write
+ *
+ * The caller must hold a reference on @mm.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, int write)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, write);
+}
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+
+ if (addr + len < addr)
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ len = __access_remote_vm(tsk, mm, addr, buf, len, write);
+
mmput(mm);
return len;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7dcca55ede7..626303b52f3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,12 +31,40 @@
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
+#include <linux/ptrace.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
+/**
+ * test_set_oom_score_adj() - set current's oom_score_adj and return old value
+ * @new_val: new oom_score_adj value
+ *
+ * Sets the oom_score_adj value for current to @new_val with proper
+ * synchronization and returns the old value. Usually used to temporarily
+ * set a value, save the old value in the caller, and then reinstate it later.
+ */
+int test_set_oom_score_adj(int new_val)
+{
+ struct sighand_struct *sighand = current->sighand;
+ int old_val;
+
+ spin_lock_irq(&sighand->siglock);
+ old_val = current->signal->oom_score_adj;
+ if (new_val != old_val) {
+ if (new_val == OOM_SCORE_ADJ_MIN)
+ atomic_inc(&current->mm->oom_disable_count);
+ else if (old_val == OOM_SCORE_ADJ_MIN)
+ atomic_dec(&current->mm->oom_disable_count);
+ current->signal->oom_score_adj = new_val;
+ }
+ spin_unlock_irq(&sighand->siglock);
+
+ return old_val;
+}
+
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligibility for kill

@@ -83,24 +111,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
#endif /* CONFIG_NUMA */
/*
- * If this is a system OOM (not a memcg OOM) and the task selected to be
- * killed is not already running at high (RT) priorities, speed up the
- * recovery by boosting the dying task to the lowest FIFO priority.
- * That helps with the recovery and avoids interfering with RT tasks.
- */
-static void boost_dying_task_prio(struct task_struct *p,
- struct mem_cgroup *mem)
-{
- struct sched_param param = { .sched_priority = 1 };
-
- if (mem)
- return;
-
- if (!rt_task(p))
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-}
-
-/*
* The process p may have detached its own ->mm while exiting or through
* use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -172,15 +182,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
}
/*
- * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
- * priority for oom killing.
- */
- if (p->flags & PF_OOM_ORIGIN) {
- task_unlock(p);
- return 1000;
- }
-
- /*
* The memory controller may have a limit of 0 bytes, so avoid a divide
* by zero, if necessary.
*/
@@ -189,10 +190,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
/*
* The baseline for the badness score is the proportion of RAM that each
- * task's rss and swap space use.
+ * task's rss, pagetable and swap space use.
*/
- points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
- totalpages;
+ points = get_mm_rss(p->mm) + p->mm->nr_ptes;
+ points += get_mm_counter(p->mm, MM_SWAPENTS);
+
+ points *= 1000;
+ points /= totalpages;
task_unlock(p);
/*
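
The new badness baseline above is simply (rss + nr_ptes + swap entries) * 1000 / totalpages, i.e. the permille of available memory the task pins, including its page tables. A small worked example with hypothetical counter values; the later oom_score_adj and root-bonus adjustments in oom_badness() are deliberately left out:

/* Illustrative only: the new oom_badness() baseline as plain arithmetic.
 * The counter values are invented. */
#include <stdio.h>

int main(void)
{
	unsigned long rss          = 200000;  /* get_mm_rss() */
	unsigned long nr_ptes      = 400;     /* mm->nr_ptes */
	unsigned long swap_entries = 50000;   /* MM_SWAPENTS counter */
	unsigned long totalpages   = 1 << 20; /* 4GiB of 4KiB pages */
	unsigned long points;

	points  = rss + nr_ptes + swap_entries;
	points *= 1000;
	points /= totalpages;

	/* (200000 + 400 + 50000) * 1000 / 1048576 ~= 238 of 1000 */
	printf("badness baseline: %lu / 1000\n", points);
	return 0;
}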
@@ -292,13 +296,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long totalpages, struct mem_cgroup *mem,
const nodemask_t *nodemask)
{
- struct task_struct *p;
+ struct task_struct *g, *p;
struct task_struct *chosen = NULL;
*ppoints = 0;
- for_each_process(p) {
+ do_each_thread(g, p) {
unsigned int points;
+ if (p->exit_state)
+ continue;
if (oom_unkillable_task(p, mem, nodemask))
continue;
@@ -313,23 +319,31 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
*/
if (test_tsk_thread_flag(p, TIF_MEMDIE))
return ERR_PTR(-1UL);
+ if (!p->mm)
+ continue;
- /*
- * This is in the process of releasing memory so wait for it
- * to finish before killing some other task by mistake.
- *
- * However, if p is the current task, we allow the 'kill' to
- * go ahead if it is exiting: this will simply set TIF_MEMDIE,
- * which will allow it to gain access to memory reserves in
- * the process of exiting and releasing its resources.
- * Otherwise we could get an easy OOM deadlock.
- */
- if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
- if (p != current)
- return ERR_PTR(-1UL);
-
- chosen = p;
- *ppoints = 1000;
+ if (p->flags & PF_EXITING) {
+ /*
+ * If p is the current task and is in the process of
+ * releasing memory, we allow the "kill" to set
+ * TIF_MEMDIE, which will allow it to gain access to
+ * memory reserves. Otherwise, it may stall forever.
+ *
+ * The loop isn't broken here, however, in case other
+ * threads are found to have already been oom killed.
+ */
+ if (p == current) {
+ chosen = p;
+ *ppoints = 1000;
+ } else {
+ /*
+ * If this task is not being ptraced on exit,
+ * then wait for it to finish before killing
+ * some other task unnecessarily.
+ */
+ if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
+ return ERR_PTR(-1UL);
+ }
}
points = oom_badness(p, mem, nodemask, totalpages);
@@ -337,7 +351,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
chosen = p;
*ppoints = points;
}
- }
+ } while_each_thread(g, p);
return chosen;
}
@@ -396,7 +410,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(current);
dump_stack();
mem_cgroup_print_oom_info(mem, p);
- show_mem();
+ show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
dump_tasks(mem, nodemask);
}
@@ -442,13 +456,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
- /*
- * We give our sacrificial lamb high priority and access to
- * all the memory it needs. That way it should be able to
- * exit() and clear out its resources quickly...
- */
- boost_dying_task_prio(p, mem);
-
return 0;
}
#undef K
@@ -472,7 +479,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
*/
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
- boost_dying_task_prio(p, mem);
return 0;
}
@@ -483,7 +489,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* If any of p's children has a different mm and is eligible for kill,
- * the one with the highest badness() score is sacrificed for its
+ * the one with the highest oom_badness() score is sacrificed for its
* parent. This attempts to lose the minimal amount of work done while
* still freeing memory.
*/
@@ -491,6 +497,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
+ if (child->mm == p->mm)
+ continue;
/*
* oom_badness() returns 0 if the thread is unkillable
*/
@@ -537,6 +545,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
unsigned int points = 0;
struct task_struct *p;
+ /*
+ * If current has a pending SIGKILL, then automatically select it. The
+ * goal is to allow it to allocate so that it may quickly exit and free
+ * its memory.
+ */
+ if (fatal_signal_pending(current)) {
+ set_thread_flag(TIF_MEMDIE);
+ return;
+ }
+
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
@@ -689,7 +707,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
- boost_dying_task_prio(current, NULL);
return;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2cb01f6ec5d..d1960744f88 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
#include <trace/events/writeback.h>
/*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ */
+#define MAX_PAUSE max(HZ/5, 1)
+
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+
+/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
+unsigned long global_dirty_limit;
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
*/
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
+ __inc_bdi_stat(bdi, BDI_WRITTEN);
__prop_inc_percpu_max(&vm_completions, &bdi->completions,
bdi->max_prop_frac);
}
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- if (bdi_cap_writeback_dirty(bdi)) {
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ prop_fraction_percpu(&vm_completions, &bdi->completions,
numerator, denominator);
- } else {
- *numerator = 0;
- *denominator = 1;
- }
}
static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
* effectively curb the growth of dirty pages. Light dirtiers with high enough
* dirty threshold may never get throttled.
*/
+#define TASK_LIMIT_FRACTION 8
static unsigned long task_dirty_limit(struct task_struct *tsk,
unsigned long bdi_dirty)
{
long numerator, denominator;
unsigned long dirty = bdi_dirty;
- u64 inv = dirty >> 3;
+ u64 inv = dirty / TASK_LIMIT_FRACTION;
task_dirties_fraction(tsk, &numerator, &denominator);
inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
return max(dirty, bdi_dirty/2);
}
+/* Minimum limit for any task */
+static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+{
+ return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+}
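
task_dirty_limit() now phrases the per-task discount via TASK_LIMIT_FRACTION: a task can be pushed at most bdi_dirty/8 below the bdi threshold, scaled by its share of recent dirtying, and task_min_dirty_limit() is the common floor (bdi_dirty - bdi_dirty/8). A standalone sketch of that math; the numerator/denominator pair stands in for task_dirties_fraction() and the values are invented:

/* Illustrative only: the TASK_LIMIT_FRACTION math from task_dirty_limit()
 * and task_min_dirty_limit(). Values are invented. */
#include <stdio.h>

#define TASK_LIMIT_FRACTION 8

static unsigned long task_limit(unsigned long bdi_dirty,
				long numerator, long denominator)
{
	unsigned long dirty = bdi_dirty;
	unsigned long long inv = dirty / TASK_LIMIT_FRACTION;

	inv *= numerator;
	inv /= denominator;	/* heavier dirtiers lose a bigger share */
	dirty -= inv;

	return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

static unsigned long task_min_limit(unsigned long bdi_dirty)
{
	return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
}

int main(void)
{
	unsigned long bdi_dirty = 80000;	/* pages, hypothetical */

	/* a task responsible for half of the recent dirtying */
	printf("heavy dirtier limit: %lu\n", task_limit(bdi_dirty, 1, 2));
	/* the lowest limit any task can be given */
	printf("minimum task limit:  %lu\n", task_min_limit(bdi_dirty));
	return 0;
}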
+
/*
*
*/
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
+static unsigned long hard_dirty_limit(unsigned long thresh)
+{
+ return max(thresh, global_dirty_limit);
+}
+
/*
* global_dirty_limits - background-writeback and dirty-throttling thresholds
*
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
*pbackground = background;
*pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
}
-/*
+/**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ * @bdi: the backing_dev_info to query
+ * @dirty: global dirty limit in pages
+ *
+ * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * And the "limit" in the name is not seriously taken as hard limit in
+ * balance_dirty_pages().
*
- * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
* - piling up dirty pages (that will take long time to sync) on slow devices
*
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
return bdi_dirty;
}
+static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+ unsigned long elapsed,
+ unsigned long written)
+{
+ const unsigned long period = roundup_pow_of_two(3 * HZ);
+ unsigned long avg = bdi->avg_write_bandwidth;
+ unsigned long old = bdi->write_bandwidth;
+ u64 bw;
+
+ /*
+ * bw = written * HZ / elapsed
+ *
+	 *                     bw * elapsed + write_bandwidth * (period - elapsed)
+	 * write_bandwidth = ---------------------------------------------------
+	 *                                          period
+ */
+ bw = written - bdi->written_stamp;
+ bw *= HZ;
+ if (unlikely(elapsed > period)) {
+ do_div(bw, elapsed);
+ avg = bw;
+ goto out;
+ }
+ bw += (u64)bdi->write_bandwidth * (period - elapsed);
+ bw >>= ilog2(period);
+
+ /*
+ * one more level of smoothing, for filtering out sudden spikes
+ */
+ if (avg > old && old >= (unsigned long)bw)
+ avg -= (avg - old) >> 3;
+
+ if (avg < old && old <= (unsigned long)bw)
+ avg += (old - avg) >> 3;
+
+out:
+ bdi->write_bandwidth = bw;
+ bdi->avg_write_bandwidth = avg;
+}
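
bdi_update_write_bandwidth() blends the bandwidth observed over the last interval into the previous estimate, weighting the new sample by elapsed/period with the period rounded up to a power of two (about 3 seconds), before a second smoothing pass produces avg_write_bandwidth. A standalone sketch of the first-stage blend; HZ and every sample value below are assumptions for the example, not measurements:

/* Illustrative only: the first-stage write bandwidth blend, i.e.
 *   new = (written * HZ + old * (period - elapsed)) / period
 * with period rounded to a power of two so the divide is a shift. */
#include <stdio.h>

#define HZ 1000UL

int main(void)
{
	unsigned long period  = 4096;	/* roundup_pow_of_two(3 * HZ) */
	unsigned long old_bw  = 25000;	/* pages/s, previous estimate */
	unsigned long elapsed = 200;	/* jiffies since last update */
	unsigned long written = 6000;	/* pages written in that time */
	unsigned long long bw;

	bw = (unsigned long long)written * HZ;		/* == rate * elapsed */
	bw += (unsigned long long)old_bw * (period - elapsed);
	bw >>= 12;					/* ilog2(4096) */

	/* instantaneous rate was 6000 * 1000 / 200 = 30000 pages/s;
	 * the blend moves the estimate only part of the way there */
	printf("old %lu, new %llu pages/s\n", old_bw, bw);
	return 0;
}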
+
+/*
+ * The global dirtyable memory and dirty threshold could be suddenly knocked
+ * down by a large amount (eg. on the startup of KVM in a swapless system).
+ * This may throw the system into deep dirty exceeded state and throttle
+ * heavy/light dirtiers alike. To retain good responsiveness, maintain
+ * global_dirty_limit for tracking slowly down to the knocked down dirty
+ * threshold.
+ */
+static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+{
+ unsigned long limit = global_dirty_limit;
+
+ /*
+ * Follow up in one step.
+ */
+ if (limit < thresh) {
+ limit = thresh;
+ goto update;
+ }
+
+ /*
+ * Follow down slowly. Use the higher one as the target, because thresh
+ * may drop below dirty. This is exactly the reason to introduce
+ * global_dirty_limit which is guaranteed to lie above the dirty pages.
+ */
+ thresh = max(thresh, dirty);
+ if (limit > thresh) {
+ limit -= (limit - thresh) >> 5;
+ goto update;
+ }
+ return;
+update:
+ global_dirty_limit = limit;
+}
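
update_dirty_limit() follows a raised threshold immediately but decays toward a lowered one by only 1/32 of the remaining gap per update, so global_dirty_limit glides down geometrically instead of collapsing at once. A short sketch of that decay with invented starting values; each loop iteration corresponds to one ~200ms BANDWIDTH_INTERVAL update:

/* Illustrative only: the "follow down slowly" step from update_dirty_limit(),
 * applied repeatedly. Values are invented. */
#include <stdio.h>

int main(void)
{
	unsigned long limit  = 400000;	/* old global_dirty_limit, pages */
	unsigned long thresh = 100000;	/* new, much lower dirty threshold */

	for (int i = 1; i <= 5; i++) {
		if (limit > thresh)
			limit -= (limit - thresh) >> 5;	/* shed 1/32 of the gap */
		printf("after %d intervals: %lu pages\n", i, limit);
	}
	/* each step closes ~3% of the gap, so the limit glides down over
	 * many seconds rather than dropping to 100000 immediately */
	return 0;
}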
+
+static void global_update_bandwidth(unsigned long thresh,
+ unsigned long dirty,
+ unsigned long now)
+{
+ static DEFINE_SPINLOCK(dirty_lock);
+ static unsigned long update_time;
+
+ /*
+	 * check locklessly first to optimize away locking most of the time
+ */
+ if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+ return;
+
+ spin_lock(&dirty_lock);
+ if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+ update_dirty_limit(thresh, dirty);
+ update_time = now;
+ }
+ spin_unlock(&dirty_lock);
+}
+
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+{
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long written;
+
+ /*
+ * rate-limit, only update once every 200ms.
+ */
+ if (elapsed < BANDWIDTH_INTERVAL)
+ return;
+
+ written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+ /*
+ * Skip quiet periods when disk bandwidth is under-utilized.
+ * (at least 1s idle time between two flusher runs)
+ */
+ if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+ goto snapshot;
+
+ if (thresh)
+ global_update_bandwidth(thresh, dirty, now);
+
+ bdi_update_write_bandwidth(bdi, elapsed, written);
+
+snapshot:
+ bdi->written_stamp = written;
+ bdi->bw_time_stamp = now;
+}
+
+static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+{
+ if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+ return;
+ spin_lock(&bdi->wb.list_lock);
+ __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+ start_time);
+ spin_unlock(&bdi->wb.list_lock);
+}
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
static void balance_dirty_pages(struct address_space *mapping,
unsigned long write_chunk)
{
- long nr_reclaimable, bdi_nr_reclaimable;
- long nr_writeback, bdi_nr_writeback;
+ unsigned long nr_reclaimable, bdi_nr_reclaimable;
+ unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
+ unsigned long bdi_dirty;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
+ unsigned long task_bdi_thresh;
+ unsigned long min_task_bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
bool dirty_exceeded = false;
+ bool clear_dirty_exceeded = true;
struct backing_dev_info *bdi = mapping->backing_dev_info;
+ unsigned long start_time = jiffies;
for (;;) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = write_chunk,
- .range_cyclic = 1,
- };
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_reclaimable + nr_writeback <=
- (background_thresh + dirty_thresh) / 2)
+ if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
break;
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
- bdi_thresh = task_dirty_limit(current, bdi_thresh);
+ min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+ task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+ bdi_dirty = bdi_nr_reclaimable +
+ bdi_stat_sum(bdi, BDI_WRITEBACK);
} else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ bdi_dirty = bdi_nr_reclaimable +
+ bdi_stat(bdi, BDI_WRITEBACK);
}
/*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
* bdi or process from holding back light ones; The latter is
* the last resort safeguard.
*/
- dirty_exceeded =
- (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
- || (nr_reclaimable + nr_writeback > dirty_thresh);
+ dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+ (nr_dirty > dirty_thresh);
+ clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+ (nr_dirty <= dirty_thresh);
if (!dirty_exceeded)
break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
if (!bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
+ bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+ bdi_thresh, bdi_dirty, start_time);
+
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,40 @@ static void balance_dirty_pages(struct address_space *mapping,
* threshold otherwise wait until the disk writes catch
* up.
*/
- trace_wbc_balance_dirty_start(&wbc, bdi);
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wb(&bdi->wb, &wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- trace_wbc_balance_dirty_written(&wbc, bdi);
+ trace_balance_dirty_start(bdi);
+ if (bdi_nr_reclaimable > task_bdi_thresh) {
+ pages_written += writeback_inodes_wb(&bdi->wb,
+ write_chunk);
+ trace_balance_dirty_written(bdi, pages_written);
if (pages_written >= write_chunk)
break; /* We've done our duty */
}
- trace_wbc_balance_dirty_wait(&wbc, bdi);
__set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
+ trace_balance_dirty_wait(bdi);
+
+ dirty_thresh = hard_dirty_limit(dirty_thresh);
+ /*
+ * max-pause area. If dirty exceeded but still within this
+ * area, no need to sleep for more than 200ms: (a) 8 pages per
+ * 200ms is typically more than enough to curb heavy dirtiers;
+ * (b) the pause time limit makes the dirtiers more responsive.
+ */
+ if (nr_dirty < dirty_thresh +
+ dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+ time_after(jiffies, start_time + MAX_PAUSE))
+ break;
+ /*
+ * pass-good area. When some bdi gets blocked (eg. NFS server
+ * not responding), or write bandwidth dropped dramatically due
+ * to concurrent reads, or dirty threshold suddenly dropped and
+ * the dirty pages cannot be brought down anytime soon (eg. on
+ * slow USB stick), at least let go of the good bdi's.
+ */
+ if (nr_dirty < dirty_thresh +
+ dirty_thresh / DIRTY_PASSGOOD_AREA &&
+ bdi_dirty < bdi_thresh)
+ break;
/*
* Increase the delay for each loop, up to our previous
@@ -578,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping,
pause = HZ / 10;
}
- if (!dirty_exceeded && bdi->dirty_exceeded)
+ /* Clear dirty_exceeded flag only when no task can exceed the limit */
+ if (clear_dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -626,9 +828,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long ratelimit;
unsigned long *p;
+ if (!bdi_cap_account_dirty(bdi))
+ return;
+
ratelimit = ratelimit_pages;
if (mapping->backing_dev_info->dirty_exceeded)
ratelimit = 8;
@@ -892,12 +1098,12 @@ int write_cache_pages(struct address_space *mapping,
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
@@ -927,7 +1133,7 @@ retry:
break;
}
- done_index = page->index + 1;
+ done_index = page->index;
lock_page(page);
@@ -977,6 +1183,7 @@ continue_unlock:
* not be suitable for data integrity
* writeout).
*/
+ done_index = page->index + 1;
done = 1;
break;
}
@@ -1039,11 +1246,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct blk_plug plug;
+ int ret;
+
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
- return write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
}
EXPORT_SYMBOL(generic_writepages);
@@ -1134,7 +1347,6 @@ EXPORT_SYMBOL(account_page_dirtied);
void account_page_writeback(struct page *page)
{
inc_zone_page_state(page, NR_WRITEBACK);
- inc_zone_page_state(page, NR_WRITTEN);
}
EXPORT_SYMBOL(account_page_writeback);
@@ -1211,6 +1423,17 @@ int set_page_dirty(struct page *page)
if (likely(mapping)) {
int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
+ /*
+	 * readahead/lru_deactivate_page could leave PG_readahead/PG_reclaim
+	 * set due to a race with end_page_writeback.
+	 * For readahead: if the page is written, the flag is reset, so there
+	 * is no problem.
+	 * For lru_deactivate_page: if the page is redirtied, the flag is
+	 * reset, so there is no problem. But if the page is then used for
+	 * readahead, the stale flag confuses readahead and makes it restart
+	 * its window ramp-up, which is only a minor problem.
+ */
+ ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
if (!spd)
spd = __set_page_dirty_buffers;
@@ -1239,7 +1462,7 @@ int set_page_dirty_lock(struct page *page)
{
int ret;
- lock_page_nosync(page);
+ lock_page(page);
ret = set_page_dirty(page);
unlock_page(page);
return ret;
@@ -1266,7 +1489,6 @@ int clear_page_dirty_for_io(struct page *page)
BUG_ON(!PageLocked(page));
- ClearPageReclaim(page);
if (mapping && mapping_cap_account_dirty(mapping)) {
/*
* Yes, Virginia, this is indeed insane.
@@ -1341,8 +1563,10 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
- if (ret)
+ if (ret) {
dec_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+ }
return ret;
}
@@ -1388,10 +1612,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
- int ret;
- rcu_read_lock();
- ret = radix_tree_tagged(&mapping->page_tree, tag);
- rcu_read_unlock();
- return ret;
+ return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cdef1d4b4e4..6e8ecb6e021 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
@@ -39,6 +40,7 @@
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
@@ -53,6 +55,8 @@
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -286,7 +290,7 @@ static void bad_page(struct page *page)
/* Don't complain about poisoned pages */
if (PageHWPoison(page)) {
- __ClearPageBuddy(page);
+ reset_page_mapcount(page); /* remove PageBuddy */
return;
}
@@ -317,7 +321,7 @@ static void bad_page(struct page *page)
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
- __ClearPageBuddy(page);
+ reset_page_mapcount(page); /* remove PageBuddy */
add_taint(TAINT_BAD_PAGE);
}
@@ -565,7 +569,8 @@ static inline int free_pages_check(struct page *page)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
+ (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
+ (mem_cgroup_bad_page_check(page)))) {
bad_page(page);
return 1;
}
@@ -614,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
list = &pcp->lists[migratetype];
} while (list_empty(list));
+ /* This is the only non-empty list. Free them all. */
+ if (batch_free == MIGRATE_PCPTYPES)
+ batch_free = to_free;
+
do {
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
@@ -750,7 +759,8 @@ static inline int check_new_page(struct page *page)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
+ (mem_cgroup_bad_page_check(page)))) {
bad_page(page);
return 1;
}
@@ -863,9 +873,8 @@ static int move_freepages(struct zone *zone,
}
order = page_order(page);
- list_del(&page->lru);
- list_add(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ list_move(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -936,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
* If breaking a large block of pages, move all free
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
- * agressive about taking ownership of free pages
+ * aggressive about taking ownership of free pages
*/
if (unlikely(current_order >= (pageblock_order >> 1)) ||
start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1333,7 +1342,7 @@ again:
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone);
+ zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
VM_BUG_ON(bad_range(zone, page));
@@ -1361,21 +1370,12 @@ failed:
#ifdef CONFIG_FAIL_PAGE_ALLOC
-static struct fail_page_alloc_attr {
+static struct {
struct fault_attr attr;
u32 ignore_gfp_highmem;
u32 ignore_gfp_wait;
u32 min_order;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
- struct dentry *ignore_gfp_highmem_file;
- struct dentry *ignore_gfp_wait_file;
- struct dentry *min_order_file;
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
@@ -1409,36 +1409,27 @@ static int __init fail_page_alloc_debugfs(void)
{
mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
- int err;
-
- err = init_fault_attr_dentries(&fail_page_alloc.attr,
- "fail_page_alloc");
- if (err)
- return err;
- dir = fail_page_alloc.attr.dentries.dir;
-
- fail_page_alloc.ignore_gfp_wait_file =
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_wait);
-
- fail_page_alloc.ignore_gfp_highmem_file =
- debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem);
- fail_page_alloc.min_order_file =
- debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order);
-
- if (!fail_page_alloc.ignore_gfp_wait_file ||
- !fail_page_alloc.ignore_gfp_highmem_file ||
- !fail_page_alloc.min_order_file) {
- err = -ENOMEM;
- debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
- debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
- debugfs_remove(fail_page_alloc.min_order_file);
- cleanup_fault_attr_dentries(&fail_page_alloc.attr);
- }
- return err;
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_wait))
+ goto fail;
+ if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem))
+ goto fail;
+ if (!debugfs_create_u32("min-order", mode, dir,
+ &fail_page_alloc.min_order))
+ goto fail;
+
+ return 0;
+fail:
+ debugfs_remove_recursive(dir);
+
+ return -ENOMEM;
}
late_initcall(fail_page_alloc_debugfs);
@@ -1607,6 +1598,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
set_bit(i, zlc->fullzones);
}
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1623,6 +1629,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
}
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
#endif /* CONFIG_NUMA */
/*
@@ -1655,7 +1665,7 @@ zonelist_scan:
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
+ continue;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1667,17 +1677,36 @@ zonelist_scan:
classzone_idx, alloc_flags))
goto try_this_zone;
+ if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+ /*
+ * we do zlc_setup if there are multiple nodes
+ * and before considering the first zone allowed
+ * by the cpuset.
+ */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
+ }
+
if (zone_reclaim_mode == 0)
goto this_zone_full;
+ /*
+ * As we may have just activated ZLC, check if the first
+ * eligible zone has failed zone_reclaim recently.
+ */
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
- goto try_next_zone;
+ continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
- goto this_zone_full;
+ continue;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
@@ -1694,16 +1723,6 @@ try_this_zone:
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
-try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup after the first zone is tried but only
- * if there are multiple nodes make it worthwhile
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1714,6 +1733,59 @@ try_next_zone:
return page;
}
+/*
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
+ */
+static inline bool should_suppress_show_mem(void)
+{
+ bool ret = false;
+
+#if NODES_SHIFT > 8
+ ret = in_interrupt();
+#endif
+ return ret;
+}
+
+static DEFINE_RATELIMIT_STATE(nopage_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{
+ va_list args;
+ unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ return;
+
+ /*
+ * This documents exceptions given to allocations in certain
+ * contexts that are allowed to allocate outside current's set
+ * of allowed nodes.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (test_thread_flag(TIF_MEMDIE) ||
+ (current->flags & (PF_MEMALLOC | PF_EXITING)))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+ if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+
+ if (fmt) {
+ printk(KERN_WARNING);
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+ }
+
+ pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
+ current->comm, order, gfp_mask);
+
+ dump_stack();
+ if (!should_suppress_show_mem())
+ show_mem(filter);
+}
+
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed)
@@ -1892,6 +1964,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
if (unlikely(!(*did_some_progress)))
return NULL;
+ /* After successful reclaim, reconsider all zones for allocation */
+ if (NUMA_BUILD)
+ zlc_clear_zones_full(zonelist);
+
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
@@ -2044,6 +2120,7 @@ restart:
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
+rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2051,7 +2128,6 @@ restart:
if (page)
goto got_pg;
-rebalance:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2156,13 +2232,7 @@ rebalance:
}
nopage:
- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- printk(KERN_WARNING "%s: page allocation failure."
- " order:%d, mode:0x%x\n",
- current->comm, order, gfp_mask);
- dump_stack();
- show_mem();
- }
+ warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
@@ -2283,6 +2353,21 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+ if (addr) {
+ unsigned long alloc_end = addr + (PAGE_SIZE << order);
+ unsigned long used = addr + PAGE_ALIGN(size);
+
+ split_page(virt_to_page((void *)addr), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
+ return (void *)addr;
+}
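
make_alloc_exact() splits the 2^order block it was handed and gives back every page beyond PAGE_ALIGN(size), which is what lets alloc_pages_exact() and alloc_pages_exact_nid() return an exact number of pages. A standalone sketch that only counts how many pages would be kept versus freed for a given request; a 4KiB page size is assumed and no memory is actually allocated:

/* Illustrative only: how many pages make_alloc_exact() keeps vs. frees. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* smallest order such that 2^order pages cover size bytes */
static unsigned int get_order_of(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long size = 9 * PAGE_SIZE + 100;	/* needs 10 pages */
	unsigned int order = get_order_of(size);
	unsigned long total = 1UL << order;		/* pages allocated */
	unsigned long used = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	printf("order %u allocation: %lu pages, kept %lu, freed back %lu\n",
	       order, total, used, total - used);
	return 0;
}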
+
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
@@ -2302,22 +2387,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned long addr;
addr = __get_free_pages(gfp_mask, order);
- if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
-
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
- }
-
- return (void *)addr;
+ return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);
/**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ * pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ * Note this is not alloc_pages_exact_node() which allocates on a specific node,
+ * but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+ unsigned order = get_order(size);
+ struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ if (!p)
+ return NULL;
+ return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
+
+/**
* free_pages_exact - release memory allocated via alloc_pages_exact()
* @virt: the value returned by alloc_pages_exact.
* @size: size of allocation, same value as passed to alloc_pages_exact().
@@ -2411,19 +2507,41 @@ void si_meminfo_node(struct sysinfo *val, int nid)
}
#endif
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+bool skip_free_areas_node(unsigned int flags, int nid)
+{
+ bool ret = false;
+
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ goto out;
+
+ get_mems_allowed();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ put_mems_allowed();
+out:
+ return ret;
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
*/
-void show_free_areas(void)
+void show_free_areas(unsigned int filter)
{
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ continue;
show_node(zone);
printk("%s per-cpu:\n", zone->name);
@@ -2465,6 +2583,8 @@ void show_free_areas(void)
for_each_populated_zone(zone) {
int i;
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ continue;
show_node(zone);
printk("%s"
" free:%lukB"
@@ -2532,6 +2652,8 @@ void show_free_areas(void)
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ continue;
show_node(zone);
printk("%s: ", zone->name);
@@ -3110,7 +3232,7 @@ static __init_refok int __build_all_zonelists(void *data)
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void build_all_zonelists(void *data)
+void __ref build_all_zonelists(void *data)
{
set_zonelist_order();
@@ -3221,6 +3343,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+ return 1;
+ }
+ return 0;
+}
+
+/*
* Mark a number of pageblocks as MIGRATE_RESERVE. The number
* of blocks reserved is based on min_wmark_pages(zone). The memory within
* the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3229,7 +3365,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
*/
static void setup_zone_migrate_reserve(struct zone *zone)
{
- unsigned long start_pfn, pfn, end_pfn;
+ unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
struct page *page;
unsigned long block_migratetype;
int reserve;
@@ -3259,7 +3395,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
continue;
/* Blocks with reserved pages will never free, skip them. */
- if (PageReserved(page))
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
continue;
block_migratetype = get_pageblock_migratetype(page);
@@ -3448,7 +3585,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
-static __meminit void setup_zone_pageset(struct zone *zone)
+static void setup_zone_pageset(struct zone *zone)
{
int cpu;
@@ -3498,7 +3635,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, alloc_size);
+ alloc_bootmem_node_nopanic(pgdat, alloc_size);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -3699,13 +3836,45 @@ void __init free_bootmem_with_active_regions(int nid,
}
#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+ int i;
+
+ for (i = nr_nodemap_entries - 1; i >= 0; i--)
+ if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+ for (index = index - 1; index >= 0; index--)
+ if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+ return index;
+
+ return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+ for (i = last_active_region_index_in_nid(nid); i != -1; \
+ i = previous_active_region_index_in_nid(i, nid))
+
u64 __init find_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
{
int i;
/* Need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
+ for_each_active_range_index_in_nid_reverse(i, nid) {
u64 addr;
u64 ei_start, ei_last;
u64 final_start, final_end;
@@ -3748,34 +3917,6 @@ int __init add_from_early_node_map(struct range *range, int az,
return nr_range;
}
-#ifdef CONFIG_NO_BOOTMEM
-void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
-{
- void *ptr;
- u64 addr;
-
- if (limit > memblock.current_limit)
- limit = memblock.current_limit;
-
- addr = find_memory_core_early(nid, size, align, goal, limit);
-
- if (addr == MEMBLOCK_ERROR)
- return NULL;
-
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(ptr, size, 0, 0);
- return ptr;
-}
-#endif
-
-
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
int i;
@@ -3856,7 +3997,7 @@ static void __init find_usable_zone_for_movable(void)
/*
* The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independant of architecture. Unlike the other zones,
+ * because it is sized independent of architecture. Unlike the other zones,
* the starting point for ZONE_MOVABLE is not fixed. It may be different
* in each node depending on the size of each node and how evenly kernelcore
* is distributed. This helper function adjusts the zone ranges
@@ -4071,7 +4212,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+ zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+ usemapsize);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat,
@@ -4191,10 +4333,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(l) {
+ for_each_lru(l)
INIT_LIST_HEAD(&zone->lru[l].list);
- zone->reclaim_stat.nr_saved_scan[l] = 0;
- }
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4237,7 +4377,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node(pgdat, size);
+ map = alloc_bootmem_node_nopanic(pgdat, size);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4459,6 +4599,60 @@ void __init sort_node_map(void)
cmp_node_active_region, NULL);
}
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, 256MiB is returned instead. Note that if only
+ * the last node is shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfns; 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+ unsigned long accl_mask = 0, last_end = 0;
+ int last_nid = -1;
+ int i;
+
+ for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+ int nid = early_node_map[i].nid;
+ unsigned long start = early_node_map[i].start_pfn;
+ unsigned long end = early_node_map[i].end_pfn;
+ unsigned long mask;
+
+ if (!start || last_nid < 0 || last_nid == nid) {
+ last_nid = nid;
+ last_end = end;
+ continue;
+ }
+
+ /*
+ * Start with a mask granular enough to pin-point to the
+ * start pfn and tick off bits one-by-one until it becomes
+ * too coarse to separate the current node from the last.
+ */
+ mask = ~((1 << __ffs(start)) - 1);
+ while (mask && last_end <= (start & (mask << 1)))
+ mask <<= 1;
+
+ /* accumulate all internode masks */
+ accl_mask |= mask;
+ }
+
+ /* convert mask to number of pages */
+ return ~accl_mask + 1;
+}
+
/* Find the lowest pfn for a node */
static unsigned long __init find_min_pfn_for_node(int nid)
{
@@ -4809,15 +5003,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
dma_reserve = new_dma_reserve;
}
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
- .bdata = &bootmem_node_data[0]
-#endif
- };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
@@ -5011,7 +5196,7 @@ void setup_per_zone_wmarks(void)
* 1TB 101 10GB
* 10TB 320 32GB
*/
-void calculate_zone_inactive_ratio(struct zone *zone)
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
{
unsigned int gb, ratio;
@@ -5025,7 +5210,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
zone->inactive_ratio = ratio;
}
-static void __init setup_per_zone_inactive_ratio(void)
+static void __meminit setup_per_zone_inactive_ratio(void)
{
struct zone *zone;
@@ -5057,7 +5242,7 @@ static void __init setup_per_zone_inactive_ratio(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
-static int __init init_per_zone_wmark_min(void)
+int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
@@ -5069,6 +5254,7 @@ static int __init init_per_zone_wmark_min(void)
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
setup_per_zone_wmarks();
+ refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
@@ -5419,10 +5605,8 @@ int set_migratetype_isolate(struct page *page)
struct memory_isolate_notify arg;
int notifier_ret;
int ret = -EBUSY;
- int zone_idx;
zone = page_zone(page);
- zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
@@ -5626,4 +5810,5 @@ void dump_page(struct page *page)
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
+ mem_cgroup_print_bad_page(page);
}
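
node_map_pfn_alignment(), added in the hunk above, computes the coarsest power-of-two granularity at which a pfn -> nid lookup must still be able to tell neighbouring nodes apart. The standalone userspace sketch below replays the same mask arithmetic on a made-up two-node layout; PAGE_SHIFT, the example PFN ranges and the use of __builtin_ctzl() in place of the kernel's __ffs() are assumptions for illustration only.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4KiB pages */

struct node_range { int nid; unsigned long start_pfn, end_pfn; };

static unsigned long pfn_alignment(const struct node_range *map, int nr)
{
	unsigned long accl_mask = 0, last_end = 0;
	int last_nid = -1;

	for (int i = 0; i < nr; i++) {
		unsigned long start = map[i].start_pfn;
		unsigned long end = map[i].end_pfn;
		unsigned long mask;

		if (!start || last_nid < 0 || last_nid == map[i].nid) {
			last_nid = map[i].nid;
			last_end = end;
			continue;
		}

		/* coarsen the mask until it can no longer separate the nodes */
		mask = ~((1UL << __builtin_ctzl(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		/* accumulate all internode masks */
		accl_mask |= mask;
	}

	return ~accl_mask + 1;	/* alignment in pages; 0 means no requirement */
}

int main(void)
{
	/* node boundary at 1.25GiB: 256MiB granularity is needed */
	struct node_range map[] = {
		{ 0, 0x00000, 0x50000 },
		{ 1, 0x50000, 0x90000 },
	};
	unsigned long align = pfn_alignment(map, 2);

	printf("alignment: %lu pages (%lu MiB)\n",
	       align, align >> (20 - PAGE_SHIFT));
	return 0;
}

With the boundary at 1.25GiB this prints 65536 pages, i.e. 256MiB, matching the example in the kerneldoc above.
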
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada7cde..39d216d535e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,12 +11,11 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>
-static void __meminit
-__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
+static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
pc->flags = 0;
+ set_page_cgroup_array_id(pc, id);
pc->mem_cgroup = NULL;
- pc->page = pfn_to_page(pfn);
INIT_LIST_HEAD(&pc->lru);
}
static unsigned long total_usage;
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
return base + offset;
}
+struct page *lookup_cgroup_page(struct page_cgroup *pc)
+{
+ unsigned long pfn;
+ struct page *page;
+ pg_data_t *pgdat;
+
+ pgdat = NODE_DATA(page_cgroup_array_id(pc));
+ pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
+ page = pfn_to_page(pfn);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));
+ return page;
+}
+
static int __init alloc_node_page_cgroup(int nid)
{
struct page_cgroup *base, *pc;
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid)
return -ENOMEM;
for (index = 0; index < nr_pages; index++) {
pc = base + index;
- __init_page_cgroup(pc, start_pfn + index);
+ init_page_cgroup(pc, nid);
}
NODE_DATA(nid)->node_page_cgroup = base;
total_usage += table_size;
@@ -105,46 +117,74 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
return section->page_cgroup + pfn;
}
-/* __alloc_bootmem...() is protected by !slab_available() */
-static int __init_refok init_section_page_cgroup(unsigned long pfn)
+struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
- struct mem_section *section = __pfn_to_section(pfn);
- struct page_cgroup *base, *pc;
- unsigned long table_size;
- int nid, index;
-
- if (!section->page_cgroup) {
- nid = page_to_nid(pfn_to_page(pfn));
- table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- VM_BUG_ON(!slab_is_available());
- if (node_state(nid, N_HIGH_MEMORY)) {
- base = kmalloc_node(table_size,
- GFP_KERNEL | __GFP_NOWARN, nid);
- if (!base)
- base = vmalloc_node(table_size, nid);
- } else {
- base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
- if (!base)
- base = vmalloc(table_size);
- }
- /*
- * The value stored in section->page_cgroup is (base - pfn)
- * and it does not point to the memory block allocated above,
- * causing kmemleak false positives.
- */
- kmemleak_not_leak(base);
+ struct mem_section *section;
+ struct page *page;
+ unsigned long nr;
+
+ nr = page_cgroup_array_id(pc);
+ section = __nr_to_section(nr);
+ page = pfn_to_page(pc - section->page_cgroup);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));
+ return page;
+}
+
+static void *__meminit alloc_page_cgroup(size_t size, int nid)
+{
+ void *addr = NULL;
+
+ addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
+ if (addr)
+ return addr;
+
+ if (node_state(nid, N_HIGH_MEMORY))
+ addr = vmalloc_node(size, nid);
+ else
+ addr = vmalloc(size);
+
+ return addr;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_cgroup(void *addr)
+{
+ if (is_vmalloc_addr(addr)) {
+ vfree(addr);
} else {
- /*
- * We don't have to allocate page_cgroup again, but
- * address of memmap may be changed. So, we have to initialize
- * again.
- */
- base = section->page_cgroup + pfn;
- table_size = 0;
- /* check address of memmap is changed or not. */
- if (base->page == pfn_to_page(pfn))
- return 0;
+ struct page *page = virt_to_page(addr);
+ size_t table_size =
+ sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+
+ BUG_ON(PageReserved(page));
+ free_pages_exact(addr, table_size);
}
+}
+#endif
+
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
+{
+ struct page_cgroup *base, *pc;
+ struct mem_section *section;
+ unsigned long table_size;
+ unsigned long nr;
+ int index;
+
+ nr = pfn_to_section_nr(pfn);
+ section = __nr_to_section(nr);
+
+ if (section->page_cgroup)
+ return 0;
+
+ table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+ base = alloc_page_cgroup(table_size, nid);
+
+ /*
+ * The value stored in section->page_cgroup is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
if (!base) {
printk(KERN_ERR "page cgroup allocation failure\n");
@@ -153,9 +193,13 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
for (index = 0; index < PAGES_PER_SECTION; index++) {
pc = base + index;
- __init_page_cgroup(pc, pfn + index);
+ init_page_cgroup(pc, nr);
}
-
+ /*
+ * The passed "pfn" may not be aligned to SECTION. For the calculation
+ * we need to apply a mask.
+ */
+ pfn &= PAGE_SECTION_MASK;
section->page_cgroup = base - pfn;
total_usage += table_size;
return 0;
@@ -170,16 +214,8 @@ void __free_page_cgroup(unsigned long pfn)
if (!ms || !ms->page_cgroup)
return;
base = ms->page_cgroup + pfn;
- if (is_vmalloc_addr(base)) {
- vfree(base);
- ms->page_cgroup = NULL;
- } else {
- struct page *page = virt_to_page(base);
- if (!PageReserved(page)) { /* Is bootmem ? */
- kfree(base);
- ms->page_cgroup = NULL;
- }
- }
+ free_page_cgroup(base);
+ ms->page_cgroup = NULL;
}
int __meminit online_page_cgroup(unsigned long start_pfn,
@@ -189,13 +225,23 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
unsigned long start, end, pfn;
int fail = 0;
- start = start_pfn & ~(PAGES_PER_SECTION - 1);
- end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ if (nid == -1) {
+ /*
+ * In this case, "nid" already exists and contains valid memory.
+ * "start_pfn" passed to us is a pfn which is an arg for
+ * online_pages(), and start_pfn should exist.
+ */
+ nid = pfn_to_nid(start_pfn);
+ VM_BUG_ON(!node_state(nid, N_ONLINE));
+ }
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
if (!pfn_present(pfn))
continue;
- fail = init_section_page_cgroup(pfn);
+ fail = init_section_page_cgroup(pfn, nid);
}
if (!fail)
return 0;
@@ -212,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
{
unsigned long start, end, pfn;
- start = start_pfn & ~(PAGES_PER_SECTION - 1);
- end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_cgroup(pfn);
@@ -243,12 +289,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
break;
}
- if (ret)
- ret = notifier_from_errno(ret);
- else
- ret = NOTIFY_OK;
-
- return ret;
+ return notifier_from_errno(ret);
}
#endif
@@ -256,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
void __init page_cgroup_init(void)
{
unsigned long pfn;
- int fail = 0;
+ int nid;
if (mem_cgroup_disabled())
return;
- for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
- if (!pfn_present(pfn))
- continue;
- fail = init_section_page_cgroup(pfn);
- }
- if (fail) {
- printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
- panic("Out of memory");
- } else {
- hotplug_memory_notifier(page_cgroup_callback, 0);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = node_start_pfn(nid);
+ end_pfn = node_end_pfn(nid);
+ /*
+ * start_pfn and end_pfn may not be aligned to SECTION and the
+ * page->flags of out of node pages are not initialized. So we
+ * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+ */
+ for (pfn = start_pfn;
+ pfn < end_pfn;
+ pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+ if (!pfn_valid(pfn))
+ continue;
+ /*
+ * Nodes' pfns can overlap.
+ * Some architectures can have a node layout such as
+ * -------------pfn-------------->
+ * N0 | N1 | N2 | N0 | N1 | N2|....
+ */
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+ if (init_section_page_cgroup(pfn, nid))
+ goto oom;
+ }
}
+ hotplug_memory_notifier(page_cgroup_callback, 0);
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
- " want memory cgroups\n");
+ printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+ "don't want memory cgroups\n");
+ return;
+oom:
+ printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+ panic("Out of memory");
}
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -349,7 +412,7 @@ not_enough_page:
* @new: new id
*
* Returns old id at success, 0 at failure.
- * (There is no mem_cgroup useing 0 as its id)
+ * (There is no mem_cgroup using 0 as its id)
*/
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new)
@@ -447,7 +510,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
if (!do_swap_account)
return 0;
- length = ((max_pages/SC_PER_PAGE) + 1);
+ length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
array = vmalloc(array_size);
@@ -464,8 +527,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
/* memory shortage */
ctrl->map = NULL;
ctrl->length = 0;
- vfree(array);
mutex_unlock(&swap_cgroup_mutex);
+ vfree(array);
goto nomem;
}
mutex_unlock(&swap_cgroup_mutex);
@@ -474,13 +537,14 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
nomem:
printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
printk(KERN_INFO
- "swap_cgroup can be disabled by noswapaccount boot option\n");
+ "swap_cgroup can be disabled by swapaccount=0 boot option\n");
return -ENOMEM;
}
void swap_cgroup_swapoff(int type)
{
- int i;
+ struct page **map;
+ unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
if (!do_swap_account)
@@ -488,17 +552,20 @@ void swap_cgroup_swapoff(int type)
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
- if (ctrl->map) {
- for (i = 0; i < ctrl->length; i++) {
- struct page *page = ctrl->map[i];
+ map = ctrl->map;
+ length = ctrl->length;
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+
+ if (map) {
+ for (i = 0; i < length; i++) {
+ struct page *page = map[i];
if (page)
__free_page(page);
}
- vfree(ctrl->map);
- ctrl->map = NULL;
- ctrl->length = 0;
+ vfree(map);
}
- mutex_unlock(&swap_cgroup_mutex);
}
#endif
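
One arithmetic clean-up in this file is easy to verify in isolation: swap_cgroup_swapon() now sizes its page array with DIV_ROUND_UP() instead of the open-coded "(max_pages/SC_PER_PAGE) + 1", which allocated one page too many whenever max_pages was an exact multiple of SC_PER_PAGE. A minimal userspace comparison of the two expressions follows; SC_PER_PAGE is hard-coded to 2048 here, assuming 4KiB pages and 2-byte swap_cgroup entries.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define SC_PER_PAGE		2048	/* assumed: PAGE_SIZE / sizeof(struct swap_cgroup) */

int main(void)
{
	unsigned long max_pages[] = { 1, 2047, 2048, 2049, 4096 };
	int n = sizeof(max_pages) / sizeof(max_pages[0]);

	for (int i = 0; i < n; i++)
		printf("max_pages=%5lu  old=%lu  new=%lu\n",
		       max_pages[i],
		       max_pages[i] / SC_PER_PAGE + 1,
		       DIV_ROUND_UP(max_pages[i], SC_PER_PAGE));
	return 0;
}

For max_pages = 2048 the old formula asks for 2 pages, the new one for 1.
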
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf46..dc76b4d0611 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
goto out;
}
if (wbc->sync_mode == WB_SYNC_ALL)
- rw |= REQ_SYNC | REQ_UNPLUG;
+ rw |= REQ_SYNC;
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7cfa6ae0230..2f5cf10ff66 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -33,19 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pmd = pmd_offset(pud, addr);
do {
+again:
next = pmd_addr_end(addr, end);
- split_huge_page_pmd(walk->mm, pmd);
- if (pmd_none_or_clear_bad(pmd)) {
+ if (pmd_none(*pmd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
+ /*
+ * This implies that each ->pmd_entry() handler
+ * needs to know about pmd_trans_huge() pmds
+ */
if (walk->pmd_entry)
err = walk->pmd_entry(pmd, addr, next, walk);
- if (!err && walk->pte_entry)
- err = walk_pte_range(pmd, addr, next, walk);
+ if (err)
+ break;
+
+ /*
+ * Check this here so we only break down trans_huge
+ * pages when we _need_ to
+ */
+ if (!walk->pte_entry)
+ continue;
+
+ split_huge_page_pmd(walk->mm, pmd);
+ if (pmd_none_or_clear_bad(pmd))
+ goto again;
+ err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -110,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
return 0;
}
-#endif
+
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma;
+
+ /* We don't need vma lookup at all. */
+ if (!walk->hugetlb_entry)
+ return NULL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ vma = find_vma(walk->mm, addr);
+ if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
+ return vma;
+
+ return NULL;
+}
+
+#else /* CONFIG_HUGETLB_PAGE */
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ return NULL;
+}
+
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ return 0;
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+
/**
* walk_page_range - walk a memory map's page tables with a callback
@@ -128,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
* associated range, and a copy of the original mm_walk for access to
* the ->private or ->mm fields.
*
- * No locks are taken, but the bottom level iterator will map PTE
+ * Usually no locks are taken, but splitting transparent huge page may
+ * take page table lock. And the bottom level iterator will map PTE
* directories from highmem if necessary.
*
* If any callback returns a non-zero value, the walk is aborted and
* the return value is propagated back to the caller. Otherwise 0 is returned.
+ *
+ * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
+ * is !NULL.
*/
int walk_page_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -149,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, addr);
do {
- struct vm_area_struct *uninitialized_var(vma);
+ struct vm_area_struct *vma;
next = pgd_addr_end(addr, end);
-#ifdef CONFIG_HUGETLB_PAGE
/*
* handle hugetlb vma individually because pagetable walk for
* the hugetlb page is dependent on the architecture and
* we can't handle it in the same manner as non-huge pages.
*/
- vma = find_vma(walk->mm, addr);
- if (vma && is_vm_hugetlb_page(vma)) {
+ vma = hugetlb_vma(addr, walk);
+ if (vma) {
if (vma->vm_end < next)
next = vma->vm_end;
/*
@@ -173,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, next);
continue;
}
-#endif
+
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
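
To see the new contract from a caller's point of view, here is a minimal kernel-style sketch of a walk_page_range() user; the helper, the callback and the counter are hypothetical, not existing kernel code. Because it only supplies ->pte_entry, the reworked walker above splits any transparent huge pmd before descending, whereas a ->pmd_entry handler must now cope with pmd_trans_huge() entries itself. mmap_sem is taken for read around the walk, as the updated kerneldoc requires whenever ->hugetlb_entry is set.

#include <linux/mm.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>

/* count resident ptes in [start, end); purely illustrative */
static int count_pte(pte_t *pte, unsigned long addr,
		     unsigned long next, struct mm_walk *walk)
{
	unsigned long *resident = walk->private;

	if (pte_present(*pte))
		(*resident)++;
	return 0;	/* a non-zero return would abort the walk */
}

static unsigned long count_resident(struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	unsigned long resident = 0;
	struct mm_walk walk = {
		.pte_entry	= count_pte,
		.mm		= mm,
		.private	= &resident,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);

	return resident;
}
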
diff --git a/mm/percpu.c b/mm/percpu.c
index 3f930018aa6..bf80e55dbed 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -342,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
* @chunk: chunk of interest
*
* Determine whether area map of @chunk needs to be extended to
- * accomodate a new allocation.
+ * accommodate a new allocation.
*
* CONTEXT:
* pcpu_lock.
@@ -431,7 +431,7 @@ out_unlock:
* depending on @head, is reduced by @tail bytes and @tail byte block
* is inserted after the target block.
*
- * @chunk->map must have enough free slots to accomodate the split.
+ * @chunk->map must have enough free slots to accommodate the split.
*
* CONTEXT:
* pcpu_lock.
@@ -1008,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
}
if (in_first_chunk) {
- if ((unsigned long)addr < VMALLOC_START ||
- (unsigned long)addr >= VMALLOC_END)
+ if (!is_vmalloc_addr(addr))
return __pa(addr);
else
return page_to_phys(vmalloc_to_page(addr));
@@ -1216,8 +1215,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
+ PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
#endif
PCPU_SETUP_BUG_ON(!base_addr);
+ PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
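
The two new PCPU_SETUP_BUG_ON() checks just above reject a first-chunk base address or __per_cpu_start that is not page aligned by testing "addr & ~PAGE_MASK". A tiny userspace illustration of that idiom, with a 4KiB page size assumed:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addrs[] = { 0x1000, 0x1200, 0x3fff000, 0x3fff008 };

	for (int i = 0; i < 4; i++)
		printf("%#10lx -> %s\n", addrs[i],
		       (addrs[i] & ~PAGE_MASK) ? "not page aligned"
					       : "page aligned");
	return 0;
}
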
@@ -1436,7 +1437,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of atom_size and is the smallest
- * which can accomodate 4k aligned segments which are equal to
+ * which can accommodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
@@ -1551,7 +1552,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
* @alloc_fn: function to allocate percpu page
- * @free_fn: funtion to free percpu page
+ * @free_fn: function to free percpu page
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
@@ -1646,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
- "space 0x%lx\n",
- max_distance, VMALLOC_END - VMALLOC_START);
+ "space 0x%lx\n", max_distance,
+ (unsigned long)(VMALLOC_END - VMALLOC_START));
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
@@ -1679,7 +1680,7 @@ out_free:
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
- * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
+ * @free_fn: function to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up page-remapped first percpu
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index 603ae98d969..799dcfd7cd8 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -13,6 +13,7 @@
#include <linux/mm.h>
#include <linux/prio_tree.h>
+#include <linux/prefetch.h>
/*
* See lib/prio_tree.c for details on the general radix priority search tree
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2..867f9dd82dc 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
static int read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned nr_pages)
{
+ struct blk_plug plug;
unsigned page_idx;
int ret;
+ blk_start_plug(&plug);
+
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
page_cache_release(page);
}
ret = 0;
+
out:
+ blk_finish_plug(&plug);
+
return ret;
}
@@ -174,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (page)
continue;
- page = page_cache_alloc_cold(mapping);
+ page = page_cache_alloc_readahead(mapping);
if (!page)
break;
page->index = page_offset;
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping,
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, true, offset, req_size);
-
-#ifdef CONFIG_BLOCK
- /*
- * Normally the current page is !uptodate and lock_page() will be
- * immediately called to implicitly unplug the device. However this
- * is not always true for RAID conifgurations, where data arrives
- * not strictly in their submission order. In this case we need to
- * explicitly kick off the IO.
- */
- if (PageUptodate(page))
- blk_run_backing_dev(mapping->backing_dev_info, NULL);
-#endif
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
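
read_pages() now wraps its submissions in an on-stack blk_plug, which is what made the explicit blk_run_backing_dev() kick at the end of this file (and REQ_UNPLUG in mm/page_io.c above) unnecessary. A kernel-style sketch of the plugging idiom follows; submit_read_batch() and the already-prepared bios are hypothetical, only blk_start_plug()/blk_finish_plug() and submit_bio() are real interfaces.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/* illustrative helper: issue a batch of prepared read bios */
static void submit_read_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* queue requests on the current task */
	for (i = 0; i < nr; i++)
		submit_bio(READ, bios[i]);
	blk_finish_plug(&plug);		/* hand the whole batch to the driver */
}

Requests issued between blk_start_plug() and blk_finish_plug() are held on the task and dispatched as a batch, so callers no longer unplug the queue by hand.
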
diff --git a/mm/rmap.c b/mm/rmap.c
index f21f4a1d6a1..8005080fb9e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,25 +21,24 @@
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
- * mapping->i_mmap_lock
- * anon_vma->lock
+ * mapping->i_mmap_mutex
+ * anon_vma->mutex
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
- * inode_lock (in set_page_dirty's __mark_inode_dirty)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
- * within inode_lock in __sync_single_inode)
+ * within bdi.wb->list_lock in __sync_single_inode)
*
- * (code doesn't rely on that order so it could be switched around)
- * ->tasklist_lock
- * anon_vma->lock (memory_failure, collect_procs_anon)
+ * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * ->tasklist_lock
* pte map lock
*/
@@ -67,17 +66,53 @@ static struct kmem_cache *anon_vma_chain_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
{
- return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+ struct anon_vma *anon_vma;
+
+ anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+ if (anon_vma) {
+ atomic_set(&anon_vma->refcount, 1);
+ /*
+ * Initialise the anon_vma root to point to itself. If called
+ * from fork, the root will be reset to the parent's anon_vma.
+ */
+ anon_vma->root = anon_vma;
+ }
+
+ return anon_vma;
}
-void anon_vma_free(struct anon_vma *anon_vma)
+static inline void anon_vma_free(struct anon_vma *anon_vma)
{
+ VM_BUG_ON(atomic_read(&anon_vma->refcount));
+
+ /*
+ * Synchronize against page_lock_anon_vma() such that
+ * we can safely hold the lock without the anon_vma getting
+ * freed.
+ *
+ * Relies on the full mb implied by the atomic_dec_and_test() from
+ * put_anon_vma() against the acquire barrier implied by
+ * mutex_trylock() from page_lock_anon_vma(). This orders:
+ *
+ * page_lock_anon_vma() VS put_anon_vma()
+ * mutex_trylock() atomic_dec_and_test()
+ * LOCK MB
+ * atomic_read() mutex_is_locked()
+ *
+ * LOCK should suffice since the actual taking of the lock must
+ * happen _before_ what follows.
+ */
+ if (mutex_is_locked(&anon_vma->root->mutex)) {
+ anon_vma_lock(anon_vma);
+ anon_vma_unlock(anon_vma);
+ }
+
kmem_cache_free(anon_vma_cachep, anon_vma);
}
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
- return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -122,7 +157,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
@@ -133,11 +168,6 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
allocated = anon_vma;
- /*
- * This VMA had no anon_vma yet. This anon_vma is
- * the root of any anon_vma tree that might form.
- */
- anon_vma->root = anon_vma;
}
anon_vma_lock(anon_vma);
@@ -156,7 +186,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
anon_vma_unlock(anon_vma);
if (unlikely(allocated))
- anon_vma_free(allocated);
+ put_anon_vma(allocated);
if (unlikely(avc))
anon_vma_chain_free(avc);
}
@@ -168,6 +198,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
return -ENOMEM;
}
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+ struct anon_vma *new_root = anon_vma->root;
+ if (new_root != root) {
+ if (WARN_ON_ONCE(root))
+ mutex_unlock(&root->mutex);
+ root = new_root;
+ mutex_lock(&root->mutex);
+ }
+ return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+ if (root)
+ mutex_unlock(&root->mutex);
+}
+
static void anon_vma_chain_link(struct vm_area_struct *vma,
struct anon_vma_chain *avc,
struct anon_vma *anon_vma)
@@ -176,13 +232,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_lock(anon_vma);
/*
* It's critical to add new vmas to the tail of the anon_vma,
* see comment in huge_memory.c:__split_huge_page().
*/
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
- anon_vma_unlock(anon_vma);
}
/*
@@ -192,13 +246,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *root = NULL;
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- avc = anon_vma_chain_alloc();
- if (!avc)
- goto enomem_failure;
- anon_vma_chain_link(dst, avc, pavc->anon_vma);
+ struct anon_vma *anon_vma;
+
+ avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ if (unlikely(!avc)) {
+ unlock_anon_vma_root(root);
+ root = NULL;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+ }
+ anon_vma = pavc->anon_vma;
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_chain_link(dst, avc, anon_vma);
}
+ unlock_anon_vma_root(root);
return 0;
enomem_failure:
@@ -231,7 +296,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
- avc = anon_vma_chain_alloc();
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
@@ -241,58 +306,63 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
*/
anon_vma->root = pvma->anon_vma->root;
/*
- * With KSM refcounts, an anon_vma can stay around longer than the
- * process it belongs to. The root anon_vma needs to be pinned
- * until this anon_vma is freed, because the lock lives in the root.
+ * With refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned until
+ * this anon_vma is freed, because the lock lives in the root.
*/
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_lock(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_unlock(anon_vma);
return 0;
out_error_free_anon_vma:
- anon_vma_free(anon_vma);
+ put_anon_vma(anon_vma);
out_error:
unlink_anon_vmas(vma);
return -ENOMEM;
}
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
+void unlink_anon_vmas(struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
- int empty;
+ struct anon_vma_chain *avc, *next;
+ struct anon_vma *root = NULL;
- /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
- if (!anon_vma)
- return;
+ /*
+ * Unlink each anon_vma chained to the VMA. This list is ordered
+ * from newest to oldest, ensuring the root anon_vma gets freed last.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
- anon_vma_lock(anon_vma);
- list_del(&anon_vma_chain->same_anon_vma);
+ root = lock_anon_vma_root(root, anon_vma);
+ list_del(&avc->same_anon_vma);
- /* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
- anon_vma_unlock(anon_vma);
+ /*
+ * Leave empty anon_vmas on the list - we'll need
+ * to free them outside the lock.
+ */
+ if (list_empty(&anon_vma->head))
+ continue;
- if (empty) {
- /* We no longer need the root anon_vma */
- if (anon_vma->root != anon_vma)
- drop_anon_vma(anon_vma->root);
- anon_vma_free(anon_vma);
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
}
-}
-
-void unlink_anon_vmas(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc, *next;
+ unlock_anon_vma_root(root);
/*
- * Unlink each anon_vma chained to the VMA. This list is ordered
- * from newest to oldest, ensuring the root anon_vma gets freed last.
+ * Iterate the list once more, it now only contains empty and unlinked
+ * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+ * needing to acquire the anon_vma->root->mutex.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
- anon_vma_unlink(avc);
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ put_anon_vma(anon_vma);
+
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
@@ -302,8 +372,8 @@ static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
- spin_lock_init(&anon_vma->lock);
- anonvma_external_refcount_init(anon_vma);
+ mutex_init(&anon_vma->mutex);
+ atomic_set(&anon_vma->refcount, 0);
INIT_LIST_HEAD(&anon_vma->head);
}
@@ -315,12 +385,31 @@ void __init anon_vma_init(void)
}
/*
- * Getting a lock on a stable anon_vma from a page off the LRU is
- * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
+ *
+ * Since there is no serialization whatsoever against page_remove_rmap()
+ * the best this function can do is return a locked anon_vma that might
+ * have been relevant to this page.
+ *
+ * The page might have been remapped to a different anon_vma or the anon_vma
+ * returned may already be freed (and even reused).
+ *
+ * In case it was remapped to a different anon_vma, the new anon_vma will be a
+ * child of the old anon_vma, and the anon_vma lifetime rules will therefore
+ * ensure that any anon_vma obtained from the page will still be valid for as
+ * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ *
+ * All users of this function must be very careful when walking the anon_vma
+ * chain and verify that the page in question is indeed mapped in it
+ * [ something equivalent to page_mapped_in_vma() ].
+ *
+ * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
+ * that the anon_vma pointer from page->mapping is valid if there is a
+ * mapcount, we can dereference the anon_vma after observing those.
*/
-struct anon_vma *__page_lock_anon_vma(struct page *page)
+struct anon_vma *page_get_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma, *root_anon_vma;
+ struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
@@ -331,32 +420,100 @@ struct anon_vma *__page_lock_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- root_anon_vma = ACCESS_ONCE(anon_vma->root);
- spin_lock(&root_anon_vma->lock);
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
/*
* If this page is still mapped, then its anon_vma cannot have been
- * freed. But if it has been unmapped, we have no security against
- * the anon_vma structure being freed and reused (for another anon_vma:
- * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
- * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
- * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+ * freed. But if it has been unmapped, we have no security against the
+ * anon_vma structure being freed and reused (for another anon_vma:
+ * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
+ * above cannot corrupt).
*/
- if (page_mapped(page))
- return anon_vma;
+ if (!page_mapped(page)) {
+ put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ }
+out:
+ rcu_read_unlock();
+
+ return anon_vma;
+}
+
+/*
+ * Similar to page_get_anon_vma() except it locks the anon_vma.
+ *
+ * It's a little more complex as it tries to keep the fast path to a single
+ * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
+ * reference like with page_get_anon_vma() and then block on the mutex.
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+ struct anon_vma *anon_vma = NULL;
+ struct anon_vma *root_anon_vma;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ if (mutex_trylock(&root_anon_vma->mutex)) {
+ /*
+ * If the page is still mapped, then this anon_vma is still
+ * its anon_vma, and holding the mutex ensures that it will
+ * not go away, see anon_vma_free().
+ */
+ if (!page_mapped(page)) {
+ mutex_unlock(&root_anon_vma->mutex);
+ anon_vma = NULL;
+ }
+ goto out;
+ }
+
+ /* trylock failed, we have to sleep */
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ if (!page_mapped(page)) {
+ put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ goto out;
+ }
+
+ /* we pinned the anon_vma, it's safe to sleep */
+ rcu_read_unlock();
+ anon_vma_lock(anon_vma);
+
+ if (atomic_dec_and_test(&anon_vma->refcount)) {
+ /*
+ * Oops, we held the last refcount, release the lock
+ * and bail -- can't simply use put_anon_vma() because
+ * we'll deadlock on the anon_vma_lock() recursion.
+ */
+ anon_vma_unlock(anon_vma);
+ __put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ }
+
+ return anon_vma;
- spin_unlock(&root_anon_vma->lock);
out:
rcu_read_unlock();
- return NULL;
+ return anon_vma;
}
void page_unlock_anon_vma(struct anon_vma *anon_vma)
- __releases(&anon_vma->root->lock)
- __releases(RCU)
{
anon_vma_unlock(anon_vma);
- rcu_read_unlock();
}
/*
@@ -497,41 +654,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
int referenced = 0;
- /*
- * Don't want to elevate referenced for mlocked page that gets this far,
- * in order that it progresses to try_to_unmap and is moved to the
- * unevictable list.
- */
- if (vma->vm_flags & VM_LOCKED) {
- *mapcount = 0; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out;
- }
-
- /* Pretend the page is referenced if the task has the
- swap token and is in the middle of a page fault. */
- if (mm != current->mm && has_swap_token(mm) &&
- rwsem_is_locked(&mm->mmap_sem))
- referenced++;
-
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
spin_lock(&mm->page_table_lock);
+ /*
+ * rmap might return false positives; we must filter
+ * these out using page_check_address_pmd().
+ */
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_FLAG);
- if (pmd && !pmd_trans_splitting(*pmd) &&
- pmdp_clear_flush_young_notify(vma, address, pmd))
+ if (!pmd) {
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ spin_unlock(&mm->page_table_lock);
+ *mapcount = 0; /* break early from loop */
+ *vm_flags |= VM_LOCKED;
+ goto out;
+ }
+
+ /* go ahead even if the pmd is pmd_trans_splitting() */
+ if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
spin_unlock(&mm->page_table_lock);
} else {
pte_t *pte;
spinlock_t *ptl;
+ /*
+ * rmap might return false positives; we must filter
+ * these out using page_check_address().
+ */
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
+ if (vma->vm_flags & VM_LOCKED) {
+ pte_unmap_unlock(pte, ptl);
+ *mapcount = 0; /* break early from loop */
+ *vm_flags |= VM_LOCKED;
+ goto out;
+ }
+
if (ptep_clear_flush_young_notify(vma, address, pte)) {
/*
* Don't treat a reference through a sequentially read
@@ -546,6 +713,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
+ /* Pretend the page is referenced if the task has the
+ swap token and is in the middle of a page fault. */
+ if (mm != current->mm && has_swap_token(mm) &&
+ rwsem_is_locked(&mm->mmap_sem))
+ referenced++;
+
(*mapcount)--;
if (referenced)
@@ -625,14 +798,14 @@ static int page_referenced_file(struct page *page,
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_lock.
+ * so we can safely take mapping->i_mmap_mutex.
*/
BUG_ON(!PageLocked(page));
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
/*
- * i_mmap_lock does not stabilize mapcount at all, but mapcount
+ * i_mmap_mutex does not stabilize mapcount at all, but mapcount
* is more likely to be accurate if we note it after spinning.
*/
mapcount = page_mapcount(page);
@@ -654,7 +827,7 @@ static int page_referenced_file(struct page *page,
break;
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return referenced;
}
@@ -696,11 +869,11 @@ int page_referenced(struct page *page,
vm_flags);
if (we_locked)
unlock_page(page);
+
+ if (page_test_and_clear_young(page_to_pfn(page)))
+ referenced++;
}
out:
- if (page_test_and_clear_young(page))
- referenced++;
-
return referenced;
}
@@ -741,7 +914,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
BUG_ON(PageAnon(page));
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED) {
unsigned long address = vma_address(page, vma);
@@ -750,7 +923,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
ret += page_mkclean_one(page, vma, address);
}
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
@@ -764,10 +937,8 @@ int page_mkclean(struct page *page)
struct address_space *mapping = page_mapping(page);
if (mapping) {
ret = page_mkclean_file(mapping, page);
- if (page_test_dirty(page)) {
- page_clear_dirty(page, 1);
+ if (page_test_and_clear_dirty(page_to_pfn(page), 1))
ret = 1;
- }
}
}
@@ -893,7 +1064,7 @@ void do_page_add_anon_rmap(struct page *page,
return;
VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /* address might be in next vma when migration races vma_adjust */
if (first)
__page_set_anon_rmap(page, vma, address, exclusive);
else
@@ -960,10 +1131,9 @@ void page_remove_rmap(struct page *page)
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
*/
- if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
- page_clear_dirty(page, 1);
+ if ((!PageAnon(page) || PageSwapCache(page)) &&
+ page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
- }
/*
* Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
* and not charged by memcg for now.
@@ -1101,7 +1271,7 @@ out_mlock:
/*
* We need mmap_sem locking. Otherwise the VM_LOCKED check gives an
* unstable result and races. Plus, we can't wait here because
- * we now hold anon_vma->lock or mapping->i_mmap_lock.
+ * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
* If the trylock fails, the page remains on the evictable lru and later
* vmscan can retry moving the page to the unevictable lru if the
* page is actually mlocked.
@@ -1327,7 +1497,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned long max_nl_size = 0;
unsigned int mapcount;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -1373,7 +1543,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
mapcount = page_mapcount(page);
if (!mapcount)
goto out;
- cond_resched_lock(&mapping->i_mmap_lock);
+ cond_resched();
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
if (max_nl_cursor == 0)
@@ -1395,7 +1565,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
}
vma->vm_private_data = (void *) max_nl_cursor;
}
- cond_resched_lock(&mapping->i_mmap_lock);
+ cond_resched();
max_nl_cursor += CLUSTER_SIZE;
} while (max_nl_cursor <= max_nl_size);
@@ -1407,7 +1577,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
vma->vm_private_data = NULL;
out:
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
@@ -1470,41 +1640,15 @@ int try_to_munlock(struct page *page)
return try_to_unmap_file(page, TTU_MUNLOCK);
}
-#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
-/*
- * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
- * if necessary. Be careful to do all the tests under the lock. Once
- * we know we are the last user, nobody else can get a reference and we
- * can do the freeing without the lock.
- */
-void drop_anon_vma(struct anon_vma *anon_vma)
+void __put_anon_vma(struct anon_vma *anon_vma)
{
- BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
- if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
- struct anon_vma *root = anon_vma->root;
- int empty = list_empty(&anon_vma->head);
- int last_root_user = 0;
- int root_empty = 0;
+ struct anon_vma *root = anon_vma->root;
- /*
- * The refcount on a non-root anon_vma got dropped. Drop
- * the refcount on the root and check if we need to free it.
- */
- if (empty && anon_vma != root) {
- BUG_ON(atomic_read(&root->external_refcount) <= 0);
- last_root_user = atomic_dec_and_test(&root->external_refcount);
- root_empty = list_empty(&root->head);
- }
- anon_vma_unlock(anon_vma);
+ if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ anon_vma_free(root);
- if (empty) {
- anon_vma_free(anon_vma);
- if (root_empty && last_root_user)
- anon_vma_free(root);
- }
- }
+ anon_vma_free(anon_vma);
}
-#endif
#ifdef CONFIG_MIGRATION
/*
@@ -1552,7 +1696,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
if (!mapping)
return ret;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
if (address == -EFAULT)
@@ -1566,7 +1710,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
* never contain migration ptes. Decide what to do about this
* limitation to linear when we need rmap_walk() on nonlinear.
*/
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
@@ -1615,7 +1759,7 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(&page->_mapcount);
if (first)
__hugepage_set_anon_rmap(page, vma, address, 0);
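
page_get_anon_vma() and the reworked page_lock_anon_vma() above both rest on one lifetime idiom: under rcu_read_lock(), atomic_inc_not_zero() takes a reference only if the refcount has not already dropped to zero, while SLAB_DESTROY_BY_RCU keeps the memory addressable (though possibly reused) for the duration of the critical section. A generic sketch of that pattern, using a hypothetical struct obj rather than anon_vma itself:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <asm/atomic.h>

/* hypothetical object managed like the reworked anon_vma */
struct obj {
	atomic_t refcount;
	/* ... payload ... */
};

static struct obj __rcu *published_obj;	/* assigned elsewhere */
static struct kmem_cache *obj_cachep;	/* created with SLAB_DESTROY_BY_RCU */

static struct obj *obj_get_live(void)
{
	struct obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(published_obj);
	if (obj && !atomic_inc_not_zero(&obj->refcount))
		obj = NULL;	/* refcount already hit zero: do not resurrect */
	rcu_read_unlock();

	return obj;		/* caller owns one reference, or NULL */
}

static void obj_put(struct obj *obj)
{
	if (atomic_dec_and_test(&obj->refcount))
		kmem_cache_free(obj_cachep, obj);
}
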
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c99060..32f6763f16f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
* 2000-2001 Christoph Rohland
* 2000-2001 SAP AG
* 2002 Red Hat Inc.
- * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2011 Hugh Dickins.
+ * Copyright (C) 2011 Google Inc.
* Copyright (C) 2002-2005 VERITAS Software Corporation.
* Copyright (C) 2004 Andi Kleen, SuSE Labs
*
@@ -28,7 +29,6 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
-#include <linux/percpu_counter.h>
#include <linux/swap.h>
static struct vfsmount *shm_mnt;
@@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt;
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/percpu_counter.h>
+#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
@@ -62,44 +65,25 @@ static struct vfsmount *shm_mnt;
#include <linux/magic.h>
#include <asm/uaccess.h>
-#include <asm/div64.h>
#include <asm/pgtable.h>
-/*
- * The maximum size of a shmem/tmpfs file is limited by the maximum size of
- * its triple-indirect swap vector - see illustration at shmem_swp_entry().
- *
- * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
- * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
- * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
- * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
- *
- * We use / and * instead of shifts in the definitions below, so that the swap
- * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
- */
-#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-
-#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
-
-#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
-#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
-
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
-/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
-#define SHMEM_PAGEIN VM_READ
-#define SHMEM_TRUNCATE VM_WRITE
-
-/* Definition to limit shmem_truncate's steps between cond_rescheds */
-#define LATENCY_LIMIT 64
-
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
-/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
+#define SHORT_SYMLINK_LEN 128
+
+struct shmem_xattr {
+ struct list_head list; /* anchored by shmem_inode_info->xattr_list */
+ char *name; /* xattr name */
+ size_t size;
+ char value[0];
+};
+
+/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -119,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void)
}
#endif
-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type);
-
-static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
-{
- /*
- * The above definition of ENTRIES_PER_PAGE, and the use of
- * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
- * might be reconsidered if it ever diverges from PAGE_SIZE.
- *
- * Mobility flags are masked out as swap vectors cannot move
- */
- return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
- PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static inline void shmem_dir_free(struct page *page)
-{
- __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static struct page **shmem_dir_map(struct page *page)
-{
- return (struct page **)kmap_atomic(page, KM_USER0);
-}
-
-static inline void shmem_dir_unmap(struct page **dir)
-{
- kunmap_atomic(dir, KM_USER0);
-}
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
-static swp_entry_t *shmem_swp_map(struct page *page)
+static inline int shmem_getpage(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, int *fault_type)
{
- return (swp_entry_t *)kmap_atomic(page, KM_USER1);
-}
-
-static inline void shmem_swp_balance_unmap(void)
-{
- /*
- * When passing a pointer to an i_direct entry, to code which
- * also handles indirect entries and so will shmem_swp_unmap,
- * we must arrange for the preempt count to remain in balance.
- * What kmap_atomic of a lowmem page does depends on config
- * and architecture, so pretend to kmap_atomic some lowmem page.
- */
- (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
-}
-
-static inline void shmem_swp_unmap(swp_entry_t *entry)
-{
- kunmap_atomic(entry, KM_USER1);
+ return shmem_getpage_gfp(inode, index, pagep, sgp,
+ mapping_gfp_mask(inode->i_mapping), fault_type);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -224,23 +165,11 @@ static const struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
- .unplug_io_fn = default_unplug_io_fn,
};
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
-static void shmem_free_blocks(struct inode *inode, long pages)
-{
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks) {
- percpu_counter_add(&sbinfo->used_blocks, -pages);
- spin_lock(&inode->i_lock);
- inode->i_blocks -= pages*BLOCKS_PER_PAGE;
- spin_unlock(&inode->i_lock);
- }
-}
-
static int shmem_reserve_inode(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -267,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
}
/**
- * shmem_recalc_inode - recalculate the size of an inode
+ * shmem_recalc_inode - recalculate the block usage of an inode
* @inode: inode to recalc
*
* We have to calculate the free blocks since the mm can drop
@@ -285,530 +214,324 @@ static void shmem_recalc_inode(struct inode *inode)
freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
if (freed > 0) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -freed);
info->alloced -= freed;
+ inode->i_blocks -= freed * BLOCKS_PER_PAGE;
shmem_unacct_blocks(info->flags, freed);
- shmem_free_blocks(inode, freed);
}
}
-/**
- * shmem_swp_entry - find the swap vector position in the info structure
- * @info: info structure for the inode
- * @index: index of the page to find
- * @page: optional page to add to the structure. Has to be preset to
- * all zeros
- *
- * If there is no space allocated yet it will return NULL when
- * page is NULL, else it will use the page for the needed block,
- * setting it to NULL on return to indicate that it has been used.
- *
- * The swap vector is organized the following way:
- *
- * There are SHMEM_NR_DIRECT entries directly stored in the
- * shmem_inode_info structure. So small files do not need an addional
- * allocation.
- *
- * For pages with index > SHMEM_NR_DIRECT there is the pointer
- * i_indirect which points to a page which holds in the first half
- * doubly indirect blocks, in the second half triple indirect blocks:
- *
- * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
- * following layout (for SHMEM_NR_DIRECT == 16):
- *
- * i_indirect -> dir --> 16-19
- * | +-> 20-23
- * |
- * +-->dir2 --> 24-27
- * | +-> 28-31
- * | +-> 32-35
- * | +-> 36-39
- * |
- * +-->dir3 --> 40-43
- * +-> 44-47
- * +-> 48-51
- * +-> 52-55
+/*
+ * Replace item expected in radix tree by a new item, while holding tree lock.
*/
-static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
-{
- unsigned long offset;
- struct page **dir;
- struct page *subdir;
+static int shmem_radix_tree_replace(struct address_space *mapping,
+ pgoff_t index, void *expected, void *replacement)
+{
+ void **pslot;
+ void *item = NULL;
+
+ VM_BUG_ON(!expected);
+ pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ if (pslot)
+ item = radix_tree_deref_slot_protected(pslot,
+ &mapping->tree_lock);
+ if (item != expected)
+ return -ENOENT;
+ if (replacement)
+ radix_tree_replace_slot(pslot, replacement);
+ else
+ radix_tree_delete(&mapping->page_tree, index);
+ return 0;
+}
- if (index < SHMEM_NR_DIRECT) {
- shmem_swp_balance_unmap();
- return info->i_direct+index;
- }
- if (!info->i_indirect) {
- if (page) {
- info->i_indirect = *page;
- *page = NULL;
- }
- return NULL; /* need another page */
- }
+/*
+ * Like add_to_page_cache_locked, but error if expected item has gone.
+ */
+static int shmem_add_to_page_cache(struct page *page,
+ struct address_space *mapping,
+ pgoff_t index, gfp_t gfp, void *expected)
+{
+ int error = 0;
- index -= SHMEM_NR_DIRECT;
- offset = index % ENTRIES_PER_PAGE;
- index /= ENTRIES_PER_PAGE;
- dir = shmem_dir_map(info->i_indirect);
-
- if (index >= ENTRIES_PER_PAGE/2) {
- index -= ENTRIES_PER_PAGE/2;
- dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
- index %= ENTRIES_PER_PAGE;
- subdir = *dir;
- if (!subdir) {
- if (page) {
- *dir = *page;
- *page = NULL;
- }
- shmem_dir_unmap(dir);
- return NULL; /* need another page */
- }
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(subdir);
- }
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageSwapBacked(page));
- dir += index;
- subdir = *dir;
- if (!subdir) {
- if (!page || !(subdir = *page)) {
- shmem_dir_unmap(dir);
- return NULL; /* need a page */
+ if (!expected)
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (!error) {
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = index;
+
+ spin_lock_irq(&mapping->tree_lock);
+ if (!expected)
+ error = radix_tree_insert(&mapping->page_tree,
+ index, page);
+ else
+ error = shmem_radix_tree_replace(mapping, index,
+ expected, page);
+ if (!error) {
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
}
- *dir = subdir;
- *page = NULL;
+ if (!expected)
+ radix_tree_preload_end();
}
- shmem_dir_unmap(dir);
- return shmem_swp_map(subdir) + offset;
+ if (error)
+ mem_cgroup_uncharge_cache_page(page);
+ return error;
}
-static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
+/*
+ * Like delete_from_page_cache, but substitutes swap for page.
+ */
+static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
- long incdec = value? 1: -1;
+ struct address_space *mapping = page->mapping;
+ int error;
- entry->val = value;
- info->swapped += incdec;
- if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
- struct page *page = kmap_atomic_to_page(entry);
- set_page_private(page, page_private(page) + incdec);
- }
+ spin_lock_irq(&mapping->tree_lock);
+ error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+ page->mapping = NULL;
+ mapping->nrpages--;
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ __dec_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
+ BUG_ON(error);
}
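
shmem_delete_from_page_cache() leaves a swap entry behind in the slot the page occupied, stored as an "exceptional" radix-tree entry. The userspace model below shows the general trick of distinguishing such encoded values from real page pointers by a low tag bit; the tag bit and shift are chosen here purely for illustration, the kernel's actual layout lives in the swapops and radix-tree headers.

/*
 * Simplified model (not the kernel's real encoding) of stowing a small
 * value in a pointer slot as an "exceptional" entry: real pointers are
 * aligned, so a set low bit marks a non-pointer value.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT	0x1UL

static void *val_to_entry(unsigned long val)
{
	return (void *)((val << 1) | EXCEPTIONAL_BIT);
}

static int entry_is_exceptional(void *entry)
{
	return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long entry_to_val(void *entry)
{
	return (uintptr_t)entry >> 1;
}

int main(void)
{
	void *entry = val_to_entry(0x2a);
	int page;	/* stands in for a struct page */

	assert(entry_is_exceptional(entry));
	assert(!entry_is_exceptional(&page));
	printf("recovered value: %#lx\n", entry_to_val(entry));
	return 0;
}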
-/**
- * shmem_swp_alloc - get the position of the swap entry for the page.
- * @info: info structure for the inode
- * @index: index of the page to find
- * @sgp: check and recheck i_size? skip allocation?
- *
- * If the entry does not exist, allocate it.
+/*
+ * Like find_get_pages, but collecting swap entries as well as pages.
*/
-static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
-{
- struct inode *inode = &info->vfs_inode;
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- struct page *page = NULL;
- swp_entry_t *entry;
-
- if (sgp != SGP_WRITE &&
- ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return ERR_PTR(-EINVAL);
-
- while (!(entry = shmem_swp_entry(info, index, &page))) {
- if (sgp == SGP_READ)
- return shmem_swp_map(ZERO_PAGE(0));
- /*
- * Test used_blocks against 1 less max_blocks, since we have 1 data
- * page (and perhaps indirect index pages) yet to allocate:
- * a waste to allocate index if we cannot allocate data.
- */
- if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
- return ERR_PTR(-ENOSPC);
- percpu_counter_inc(&sbinfo->used_blocks);
- spin_lock(&inode->i_lock);
- inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&inode->i_lock);
+static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_pages,
+ struct page **pages, pgoff_t *indices)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, indices, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart;
+ /*
+ * Otherwise, we must be storing a swap entry
+ * here as an exceptional entry: so return it
+ * without attempting to raise page count.
+ */
+ goto export;
}
+ if (!page_cache_get_speculative(page))
+ goto repeat;
- spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
- spin_lock(&info->lock);
-
- if (!page) {
- shmem_free_blocks(inode, 1);
- return ERR_PTR(-ENOMEM);
- }
- if (sgp != SGP_WRITE &&
- ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
- entry = ERR_PTR(-EINVAL);
- break;
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
}
- if (info->next_index <= index)
- info->next_index = index + 1;
+export:
+ indices[ret] = indices[i];
+ pages[ret] = page;
+ ret++;
}
- if (page) {
- /* another task gave its page, or truncated the file */
- shmem_free_blocks(inode, 1);
- shmem_dir_free(page);
- }
- if (info->next_index <= index && !IS_ERR(entry))
- info->next_index = index + 1;
- return entry;
+ if (unlikely(!ret && nr_found))
+ goto restart;
+ rcu_read_unlock();
+ return ret;
}
-/**
- * shmem_free_swp - free some swap entries in a directory
- * @dir: pointer to the directory
- * @edir: pointer after last entry of the directory
- * @punch_lock: pointer to spinlock when needed for the holepunch case
+/*
+ * Remove swap entry from radix tree, free the swap and its page cache.
*/
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
- spinlock_t *punch_lock)
-{
- spinlock_t *punch_unlock = NULL;
- swp_entry_t *ptr;
- int freed = 0;
-
- for (ptr = dir; ptr < edir; ptr++) {
- if (ptr->val) {
- if (unlikely(punch_lock)) {
- punch_unlock = punch_lock;
- punch_lock = NULL;
- spin_lock(punch_unlock);
- if (!ptr->val)
- continue;
- }
- free_swap_and_cache(*ptr);
- *ptr = (swp_entry_t){0};
- freed++;
- }
- }
- if (punch_unlock)
- spin_unlock(punch_unlock);
- return freed;
-}
-
-static int shmem_map_and_free_swp(struct page *subdir, int offset,
- int limit, struct page ***dir, spinlock_t *punch_lock)
-{
- swp_entry_t *ptr;
- int freed = 0;
-
- ptr = shmem_swp_map(subdir);
- for (; offset < limit; offset += LATENCY_LIMIT) {
- int size = limit - offset;
- if (size > LATENCY_LIMIT)
- size = LATENCY_LIMIT;
- freed += shmem_free_swp(ptr+offset, ptr+offset+size,
- punch_lock);
- if (need_resched()) {
- shmem_swp_unmap(ptr);
- if (*dir) {
- shmem_dir_unmap(*dir);
- *dir = NULL;
- }
- cond_resched();
- ptr = shmem_swp_map(subdir);
- }
- }
- shmem_swp_unmap(ptr);
- return freed;
+static int shmem_free_swap(struct address_space *mapping,
+ pgoff_t index, void *radswap)
+{
+ int error;
+
+ spin_lock_irq(&mapping->tree_lock);
+ error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+ spin_unlock_irq(&mapping->tree_lock);
+ if (!error)
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return error;
}
-static void shmem_free_pages(struct list_head *next)
+/*
+ * Pagevec may contain swap entries, so shuffle up pages before releasing.
+ */
+static void shmem_pagevec_release(struct pagevec *pvec)
{
- struct page *page;
- int freed = 0;
-
- do {
- page = container_of(next, struct page, lru);
- next = next->next;
- shmem_dir_free(page);
- freed++;
- if (freed >= LATENCY_LIMIT) {
- cond_resched();
- freed = 0;
- }
- } while (next);
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
+ pagevec_release(pvec);
}
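
A tiny userspace sketch of the same shuffle-up filter follows; odd numbers stand in for the exceptional (swap) entries, which is an assumption of the sketch only.

/*
 * In-place "shuffle up": drop the slots holding exceptional entries,
 * pack the remaining items to the front, shrink the count to match.
 */
#include <stdio.h>

int main(void)
{
	unsigned long slots[] = { 8, 3, 12, 7, 16 };	/* 3 and 7 are "exceptional" */
	unsigned int nr = 5, i, j;

	for (i = 0, j = 0; i < nr; i++) {
		if (slots[i] & 1)	/* exceptional entry: skip it */
			continue;
		slots[j++] = slots[i];
	}
	nr = j;

	for (i = 0; i < nr; i++)
		printf("%lu ", slots[i]);
	printf("(nr=%u)\n", nr);	/* prints: 8 12 16 (nr=3) */
	return 0;
}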
-static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+/*
+ * Remove range of pages and swap entries from radix tree, and free them.
+ */
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
+ struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long idx;
- unsigned long size;
- unsigned long limit;
- unsigned long stage;
- unsigned long diroff;
- struct page **dir;
- struct page *topdir;
- struct page *middir;
- struct page *subdir;
- swp_entry_t *ptr;
- LIST_HEAD(pages_to_free);
- long nr_pages_to_free = 0;
+ pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+ pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
long nr_swaps_freed = 0;
- int offset;
- int freed;
- int punch_hole;
- spinlock_t *needs_lock;
- spinlock_t *punch_lock;
- unsigned long upper_limit;
+ pgoff_t index;
+ int i;
+
+ BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+
+ pagevec_init(&pvec, 0);
+ index = start;
+ while (index <= end) {
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ pvec.pages, indices);
+ if (!pvec.nr)
+ break;
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (idx >= info->next_index)
- return;
+ index = indices[i];
+ if (index > end)
+ break;
- spin_lock(&info->lock);
- info->flags |= SHMEM_TRUNCATE;
- if (likely(end == (loff_t) -1)) {
- limit = info->next_index;
- upper_limit = SHMEM_MAX_INDEX;
- info->next_index = idx;
- needs_lock = NULL;
- punch_hole = 0;
- } else {
- if (end + 1 >= inode->i_size) { /* we may free a little more */
- limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- upper_limit = SHMEM_MAX_INDEX;
- } else {
- limit = (end + 1) >> PAGE_CACHE_SHIFT;
- upper_limit = limit;
- }
- needs_lock = &info->lock;
- punch_hole = 1;
- }
+ if (radix_tree_exceptional_entry(page)) {
+ nr_swaps_freed += !shmem_free_swap(mapping,
+ index, page);
+ continue;
+ }
- topdir = info->i_indirect;
- if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
- info->i_indirect = NULL;
- nr_pages_to_free++;
- list_add(&topdir->lru, &pages_to_free);
+ if (!trylock_page(page))
+ continue;
+ if (page->mapping == mapping) {
+ VM_BUG_ON(PageWriteback(page));
+ truncate_inode_page(mapping, page);
+ }
+ unlock_page(page);
+ }
+ shmem_pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
+ index++;
}
- spin_unlock(&info->lock);
- if (info->swapped && idx < SHMEM_NR_DIRECT) {
- ptr = info->i_direct;
- size = limit;
- if (size > SHMEM_NR_DIRECT)
- size = SHMEM_NR_DIRECT;
- nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
+ if (partial) {
+ struct page *page = NULL;
+ shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+ if (page) {
+ zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
}
- /*
- * If there are no indirect blocks or we are punching a hole
- * below indirect blocks, nothing to be done.
- */
- if (!topdir || limit <= SHMEM_NR_DIRECT)
- goto done2;
+ index = start;
+ for ( ; ; ) {
+ cond_resched();
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ pvec.pages, indices);
+ if (!pvec.nr) {
+ if (index == start)
+ break;
+ index = start;
+ continue;
+ }
+ if (index == start && indices[0] > end) {
+ shmem_pagevec_release(&pvec);
+ break;
+ }
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
- /*
- * The truncation case has already dropped info->lock, and we're safe
- * because i_size and next_index have already been lowered, preventing
- * access beyond. But in the punch_hole case, we still need to take
- * the lock when updating the swap directory, because there might be
- * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
- * shmem_writepage. However, whenever we find we can remove a whole
- * directory page (not at the misaligned start or end of the range),
- * we first NULLify its pointer in the level above, and then have no
- * need to take the lock when updating its contents: needs_lock and
- * punch_lock (either pointing to info->lock or NULL) manage this.
- */
+ index = indices[i];
+ if (index > end)
+ break;
- upper_limit -= SHMEM_NR_DIRECT;
- limit -= SHMEM_NR_DIRECT;
- idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
- offset = idx % ENTRIES_PER_PAGE;
- idx -= offset;
-
- dir = shmem_dir_map(topdir);
- stage = ENTRIES_PER_PAGEPAGE/2;
- if (idx < ENTRIES_PER_PAGEPAGE/2) {
- middir = topdir;
- diroff = idx/ENTRIES_PER_PAGE;
- } else {
- dir += ENTRIES_PER_PAGE/2;
- dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
- while (stage <= idx)
- stage += ENTRIES_PER_PAGEPAGE;
- middir = *dir;
- if (*dir) {
- diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
- ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
- if (!diroff && !offset && upper_limit >= stage) {
- if (needs_lock) {
- spin_lock(needs_lock);
- *dir = NULL;
- spin_unlock(needs_lock);
- needs_lock = NULL;
- } else
- *dir = NULL;
- nr_pages_to_free++;
- list_add(&middir->lru, &pages_to_free);
+ if (radix_tree_exceptional_entry(page)) {
+ nr_swaps_freed += !shmem_free_swap(mapping,
+ index, page);
+ continue;
}
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(middir);
- } else {
- diroff = 0;
- offset = 0;
- idx = stage;
- }
- }
- for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
- if (unlikely(idx == stage)) {
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(topdir) +
- ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
- while (!*dir) {
- dir++;
- idx += ENTRIES_PER_PAGEPAGE;
- if (idx >= limit)
- goto done1;
- }
- stage = idx + ENTRIES_PER_PAGEPAGE;
- middir = *dir;
- if (punch_hole)
- needs_lock = &info->lock;
- if (upper_limit >= stage) {
- if (needs_lock) {
- spin_lock(needs_lock);
- *dir = NULL;
- spin_unlock(needs_lock);
- needs_lock = NULL;
- } else
- *dir = NULL;
- nr_pages_to_free++;
- list_add(&middir->lru, &pages_to_free);
+ lock_page(page);
+ if (page->mapping == mapping) {
+ VM_BUG_ON(PageWriteback(page));
+ truncate_inode_page(mapping, page);
}
- shmem_dir_unmap(dir);
- cond_resched();
- dir = shmem_dir_map(middir);
- diroff = 0;
- }
- punch_lock = needs_lock;
- subdir = dir[diroff];
- if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
- if (needs_lock) {
- spin_lock(needs_lock);
- dir[diroff] = NULL;
- spin_unlock(needs_lock);
- punch_lock = NULL;
- } else
- dir[diroff] = NULL;
- nr_pages_to_free++;
- list_add(&subdir->lru, &pages_to_free);
- }
- if (subdir && page_private(subdir) /* has swap entries */) {
- size = limit - idx;
- if (size > ENTRIES_PER_PAGE)
- size = ENTRIES_PER_PAGE;
- freed = shmem_map_and_free_swp(subdir,
- offset, size, &dir, punch_lock);
- if (!dir)
- dir = shmem_dir_map(middir);
- nr_swaps_freed += freed;
- if (offset || punch_lock) {
- spin_lock(&info->lock);
- set_page_private(subdir,
- page_private(subdir) - freed);
- spin_unlock(&info->lock);
- } else
- BUG_ON(page_private(subdir) != freed);
+ unlock_page(page);
}
- offset = 0;
- }
-done1:
- shmem_dir_unmap(dir);
-done2:
- if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
- /*
- * Call truncate_inode_pages again: racing shmem_unuse_inode
- * may have swizzled a page in from swap since
- * truncate_pagecache or generic_delete_inode did it, before we
- * lowered next_index. Also, though shmem_getpage checks
- * i_size before adding to cache, no recheck after: so fix the
- * narrow window there too.
- *
- * Recalling truncate_inode_pages_range and unmap_mapping_range
- * every time for punch_hole (which never got a chance to clear
- * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
- * yet hardly ever necessary: try to optimize them out later.
- */
- truncate_inode_pages_range(inode->i_mapping, start, end);
- if (punch_hole)
- unmap_mapping_range(inode->i_mapping, start,
- end - start, 1);
+ shmem_pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ index++;
}
spin_lock(&info->lock);
- info->flags &= ~SHMEM_TRUNCATE;
info->swapped -= nr_swaps_freed;
- if (nr_pages_to_free)
- shmem_free_blocks(inode, nr_pages_to_free);
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
- /*
- * Empty swap vector directory pages to be freed?
- */
- if (!list_empty(&pages_to_free)) {
- pages_to_free.prev->next = NULL;
- shmem_free_pages(pages_to_free.next);
- }
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
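
The index arithmetic at the top of shmem_truncate_range() is worth a worked example. The userspace sketch below (PAGE_SIZE assumed 4096 here) shows how a byte range maps onto the first whole page to drop, the partial tail to zero in the page before it, and the last index covered when lend is -1.

/*
 * Worked example of the start/partial/end computation in
 * shmem_truncate_range(), with PAGE_SIZE assumed to be 4096.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12

int main(void)
{
	long long lstart = 10000;	/* truncate from byte 10000 ... */
	long long lend = -1;		/* ... to end of file */

	unsigned long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long partial = lstart & (PAGE_SIZE - 1);
	unsigned long long end = (unsigned long long)lend >> PAGE_SHIFT;

	/*
	 * Page index 2 keeps bytes 8192..9999, so the tail of that page is
	 * zeroed from offset 'partial' (1808); whole pages are removed from
	 * index 'start' (3) upwards.
	 */
	printf("start=%lu partial=%lu end=%#llx\n", start, partial, end);
	return 0;
}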
-static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
- loff_t newsize = attr->ia_size;
int error;
error = inode_change_ok(inode, attr);
if (error)
return error;
- if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
- && newsize != inode->i_size) {
- struct page *page = NULL;
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ loff_t oldsize = inode->i_size;
+ loff_t newsize = attr->ia_size;
- if (newsize < inode->i_size) {
- /*
- * If truncating down to a partial page, then
- * if that page is already allocated, hold it
- * in memory until the truncation is over, so
- * truncate_partial_page cannot miss it were
- * it assigned to swap.
- */
- if (newsize & (PAGE_CACHE_SIZE-1)) {
- (void) shmem_getpage(inode,
- newsize >> PAGE_CACHE_SHIFT,
- &page, SGP_READ, NULL);
- if (page)
- unlock_page(page);
- }
- /*
- * Reset SHMEM_PAGEIN flag so that shmem_truncate can
- * detect if any pages might have been added to cache
- * after truncate_inode_pages. But we needn't bother
- * if it's being fully truncated to zero-length: the
- * nrpages check is efficient enough in that case.
- */
- if (newsize) {
- struct shmem_inode_info *info = SHMEM_I(inode);
- spin_lock(&info->lock);
- info->flags &= ~SHMEM_PAGEIN;
- spin_unlock(&info->lock);
- }
+ if (newsize != oldsize) {
+ i_size_write(inode, newsize);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ }
+ if (newsize < oldsize) {
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+ shmem_truncate_range(inode, newsize, (loff_t)-1);
+ /* unmap again to remove racily COWed private pages */
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
}
-
- /* XXX(truncate): truncate_setsize should be called last */
- truncate_setsize(inode, newsize);
- if (page)
- page_cache_release(page);
- shmem_truncate_range(inode, newsize, (loff_t)-1);
}
setattr_copy(inode, attr);
@@ -822,9 +545,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_xattr *xattr, *nxattr;
if (inode->i_mapping->a_ops == &shmem_aops) {
- truncate_inode_pages(inode->i_mapping, 0);
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -833,200 +556,111 @@ static void shmem_evict_inode(struct inode *inode)
list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
}
+ } else
+ kfree(info->symlink);
+
+ list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
+ kfree(xattr->name);
+ kfree(xattr);
}
BUG_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
end_writeback(inode);
}
-static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
-{
- swp_entry_t *ptr;
-
- for (ptr = dir; ptr < edir; ptr++) {
- if (ptr->val == entry.val)
- return ptr - dir;
- }
- return -1;
-}
-
-static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct shmem_inode_info *info,
+ swp_entry_t swap, struct page *page)
{
- struct inode *inode;
- unsigned long idx;
- unsigned long size;
- unsigned long limit;
- unsigned long stage;
- struct page **dir;
- struct page *subdir;
- swp_entry_t *ptr;
- int offset;
+ struct address_space *mapping = info->vfs_inode.i_mapping;
+ void *radswap;
+ pgoff_t index;
int error;
- idx = 0;
- ptr = info->i_direct;
- spin_lock(&info->lock);
- if (!info->swapped) {
- list_del_init(&info->swaplist);
- goto lost2;
- }
- limit = info->next_index;
- size = limit;
- if (size > SHMEM_NR_DIRECT)
- size = SHMEM_NR_DIRECT;
- offset = shmem_find_swp(entry, ptr, ptr+size);
- if (offset >= 0)
- goto found;
- if (!info->i_indirect)
- goto lost2;
-
- dir = shmem_dir_map(info->i_indirect);
- stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
-
- for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
- if (unlikely(idx == stage)) {
- shmem_dir_unmap(dir-1);
- if (cond_resched_lock(&info->lock)) {
- /* check it has not been truncated */
- if (limit > info->next_index) {
- limit = info->next_index;
- if (idx >= limit)
- goto lost2;
- }
- }
- dir = shmem_dir_map(info->i_indirect) +
- ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
- while (!*dir) {
- dir++;
- idx += ENTRIES_PER_PAGEPAGE;
- if (idx >= limit)
- goto lost1;
- }
- stage = idx + ENTRIES_PER_PAGEPAGE;
- subdir = *dir;
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(subdir);
- }
- subdir = *dir;
- if (subdir && page_private(subdir)) {
- ptr = shmem_swp_map(subdir);
- size = limit - idx;
- if (size > ENTRIES_PER_PAGE)
- size = ENTRIES_PER_PAGE;
- offset = shmem_find_swp(entry, ptr, ptr+size);
- shmem_swp_unmap(ptr);
- if (offset >= 0) {
- shmem_dir_unmap(dir);
- goto found;
- }
- }
- }
-lost1:
- shmem_dir_unmap(dir-1);
-lost2:
- spin_unlock(&info->lock);
- return 0;
-found:
- idx += offset;
- inode = igrab(&info->vfs_inode);
- spin_unlock(&info->lock);
+ radswap = swp_to_radix_entry(swap);
+ index = radix_tree_locate_item(&mapping->page_tree, radswap);
+ if (index == -1)
+ return 0;
/*
* Move _head_ to start search for next from here.
* But be careful: shmem_evict_inode checks list_empty without taking
* mutex, and there's an instant in list_move_tail when info->swaplist
- * would appear empty, if it were the only one on shmem_swaplist. We
- * could avoid doing it if inode NULL; or use this minor optimization.
+ * would appear empty, if it were the only one on shmem_swaplist.
*/
if (shmem_swaplist.next != &info->swaplist)
list_move_tail(&shmem_swaplist, &info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
- error = 1;
- if (!inode)
- goto out;
/*
- * Charge page using GFP_KERNEL while we can wait.
- * Charged back to the user(not to caller) when swap account is used.
- * add_to_page_cache() will be called with GFP_NOWAIT.
+ * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
+ * but also to hold up shmem_evict_inode(): so inode cannot be freed
+ * beneath us (pagelock doesn't help until the page is in pagecache).
*/
- error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
- if (error)
- goto out;
- error = radix_tree_preload(GFP_KERNEL);
- if (error) {
- mem_cgroup_uncharge_cache_page(page);
- goto out;
- }
- error = 1;
+ error = shmem_add_to_page_cache(page, mapping, index,
+ GFP_NOWAIT, radswap);
+ /* which does mem_cgroup_uncharge_cache_page on error */
- spin_lock(&info->lock);
- ptr = shmem_swp_entry(info, idx, NULL);
- if (ptr && ptr->val == entry.val) {
- error = add_to_page_cache_locked(page, inode->i_mapping,
- idx, GFP_NOWAIT);
- /* does mem_cgroup_uncharge_cache_page on error */
- } else /* we must compensate for our precharge above */
- mem_cgroup_uncharge_cache_page(page);
-
- if (error == -EEXIST) {
- struct page *filepage = find_get_page(inode->i_mapping, idx);
- error = 1;
- if (filepage) {
- /*
- * There might be a more uptodate page coming down
- * from a stacked writepage: forget our swappage if so.
- */
- if (PageUptodate(filepage))
- error = 0;
- page_cache_release(filepage);
- }
- }
- if (!error) {
+ if (error != -ENOMEM) {
+ /*
+ * Truncation and eviction use free_swap_and_cache(), which
+ * only does trylock page: if we raced, best clean up here.
+ */
delete_from_swap_cache(page);
set_page_dirty(page);
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, ptr, 0);
- swap_free(entry);
+ if (!error) {
+ spin_lock(&info->lock);
+ info->swapped--;
+ spin_unlock(&info->lock);
+ swap_free(swap);
+ }
error = 1; /* not an error, but entry was found */
}
- if (ptr)
- shmem_swp_unmap(ptr);
- spin_unlock(&info->lock);
- radix_tree_preload_end();
-out:
- unlock_page(page);
- page_cache_release(page);
- iput(inode); /* allows for NULL */
return error;
}
/*
- * shmem_unuse() search for an eventually swapped out shmem page.
+ * Search through swapped inodes to find and replace swap by page.
*/
-int shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t swap, struct page *page)
{
- struct list_head *p, *next;
+ struct list_head *this, *next;
struct shmem_inode_info *info;
int found = 0;
+ int error;
+
+ /*
+ * Charge page using GFP_KERNEL while we can wait, before taking
+ * the shmem_swaplist_mutex which might hold up shmem_writepage().
+ * Charged back to the user (not to caller) when swap account is used.
+ */
+ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+ if (error)
+ goto out;
+ /* No radix_tree_preload: swap entry keeps a place for page in tree */
mutex_lock(&shmem_swaplist_mutex);
- list_for_each_safe(p, next, &shmem_swaplist) {
- info = list_entry(p, struct shmem_inode_info, swaplist);
- found = shmem_unuse_inode(info, entry, page);
+ list_for_each_safe(this, next, &shmem_swaplist) {
+ info = list_entry(this, struct shmem_inode_info, swaplist);
+ if (info->swapped)
+ found = shmem_unuse_inode(info, swap, page);
+ else
+ list_del_init(&info->swaplist);
cond_resched();
if (found)
- goto out;
+ break;
}
mutex_unlock(&shmem_swaplist_mutex);
- /*
- * Can some race bring us here? We've been holding page lock,
- * so I think not; but would rather try again later than BUG()
- */
+
+ if (!found)
+ mem_cgroup_uncharge_cache_page(page);
+ if (found < 0)
+ error = found;
+out:
unlock_page(page);
page_cache_release(page);
-out:
- return (found < 0) ? found : 0;
+ return error;
}
/*
@@ -1035,10 +669,10 @@ out:
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
struct shmem_inode_info *info;
- swp_entry_t *entry, swap;
struct address_space *mapping;
- unsigned long index;
struct inode *inode;
+ swp_entry_t swap;
+ pgoff_t index;
BUG_ON(!PageLocked(page));
mapping = page->mapping;
@@ -1053,63 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
/*
* shmem_backing_dev_info's capabilities prevent regular writeback or
* sync from ever calling shmem_writepage; but a stacking filesystem
- * may use the ->writepage of its underlying filesystem, in which case
+ * might use ->writepage of its underlying filesystem, in which case
* tmpfs should write out to swap only in response to memory pressure,
- * and not for the writeback threads or sync. However, in those cases,
- * we do still want to check if there's a redundant swappage to be
- * discarded.
+ * and not for the writeback threads or sync.
*/
- if (wbc->for_reclaim)
- swap = get_swap_page();
- else
- swap.val = 0;
-
- spin_lock(&info->lock);
- if (index >= info->next_index) {
- BUG_ON(!(info->flags & SHMEM_TRUNCATE));
- goto unlock;
- }
- entry = shmem_swp_entry(info, index, NULL);
- if (entry->val) {
- /*
- * The more uptodate page coming down from a stacked
- * writepage should replace our old swappage.
- */
- free_swap_and_cache(*entry);
- shmem_swp_set(info, entry, 0);
+ if (!wbc->for_reclaim) {
+ WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
+ goto redirty;
}
- shmem_recalc_inode(inode);
+ swap = get_swap_page();
+ if (!swap.val)
+ goto redirty;
- if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
- remove_from_page_cache(page);
- shmem_swp_set(info, entry, swap.val);
- shmem_swp_unmap(entry);
- if (list_empty(&info->swaplist))
- inode = igrab(inode);
- else
- inode = NULL;
- spin_unlock(&info->lock);
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now before the page is
+ * moved to swap cache, when its pagelock no longer protects
+ * the inode from eviction. But don't unlock the mutex until
+ * we've incremented swapped, because shmem_unuse_inode() will
+ * prune a !swapped inode from the swaplist under this mutex.
+ */
+ mutex_lock(&shmem_swaplist_mutex);
+ if (list_empty(&info->swaplist))
+ list_add_tail(&info->swaplist, &shmem_swaplist);
+
+ if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
swap_shmem_alloc(swap);
+ shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
+ spin_lock(&info->lock);
+ info->swapped++;
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
+ mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(page_mapped(page));
- page_cache_release(page); /* pagecache ref */
swap_writepage(page, wbc);
- if (inode) {
- mutex_lock(&shmem_swaplist_mutex);
- /* move instead of add in case we're racing */
- list_move_tail(&info->swaplist, &shmem_swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
- iput(inode);
- }
return 0;
}
- shmem_swp_unmap(entry);
-unlock:
- spin_unlock(&info->lock);
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
+ mutex_unlock(&shmem_swaplist_mutex);
swapcache_free(swap, NULL);
redirty:
set_page_dirty(page);
@@ -1146,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
}
#endif /* CONFIG_TMPFS */
-static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
{
struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
- struct page *page;
spol = mpol_cond_copy(&mpol,
- mpol_shared_policy_lookup(&info->policy, idx));
+ mpol_shared_policy_lookup(&info->policy, index));
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
- pvma.vm_pgoff = idx;
+ pvma.vm_pgoff = index;
pvma.vm_ops = NULL;
pvma.vm_policy = spol;
- page = swapin_readahead(entry, gfp, &pvma, 0);
- return page;
+ return swapin_readahead(swap, gfp, &pvma, 0);
}
static struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+ struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
- pvma.vm_pgoff = idx;
+ pvma.vm_pgoff = index;
pvma.vm_ops = NULL;
- pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
/*
* alloc_page_vma() will drop the shared policy reference
@@ -1183,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
#endif /* CONFIG_TMPFS */
-static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
{
- return swapin_readahead(entry, gfp, NULL, 0);
+ return swapin_readahead(swap, gfp, NULL, 0);
}
static inline struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+ struct shmem_inode_info *info, pgoff_t index)
{
return alloc_page(gfp);
}
@@ -1209,300 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
#endif
/*
- * shmem_getpage - either get the page from swap or allocate a new one
+ * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache
*/
-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type)
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
- struct page *filepage = *pagep;
- struct page *swappage;
- struct page *prealloc_page = NULL;
- swp_entry_t *entry;
+ struct page *page;
swp_entry_t swap;
- gfp_t gfp;
int error;
+ int once = 0;
- if (idx >= SHMEM_MAX_INDEX)
+ if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
return -EFBIG;
+repeat:
+ swap.val = 0;
+ page = find_lock_page(mapping, index);
+ if (radix_tree_exceptional_entry(page)) {
+ swap = radix_to_swp_entry(page);
+ page = NULL;
+ }
- if (type)
- *type = 0;
+ if (sgp != SGP_WRITE &&
+ ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ error = -EINVAL;
+ goto failed;
+ }
- /*
- * Normally, filepage is NULL on entry, and either found
- * uptodate immediately, or allocated and zeroed, or read
- * in under swappage, which is then assigned to filepage.
- * But shmem_readpage (required for splice) passes in a locked
- * filepage, which may be found not uptodate by other callers
- * too, and may need to be copied from the swappage read in.
- */
-repeat:
- if (!filepage)
- filepage = find_lock_page(mapping, idx);
- if (filepage && PageUptodate(filepage))
- goto done;
- gfp = mapping_gfp_mask(mapping);
- if (!filepage) {
+ if (page || (sgp == SGP_READ && !swap.val)) {
/*
- * Try to preload while we can wait, to not make a habit of
- * draining atomic reserves; but don't latch on to this cpu.
+ * Once we can get the page lock, it must be uptodate:
+ * if there were an error in reading back from swap,
+ * the page would not be inserted into the filecache.
*/
- error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
- if (error)
- goto failed;
- radix_tree_preload_end();
- if (sgp != SGP_READ && !prealloc_page) {
- /* We don't care if this fails */
- prealloc_page = shmem_alloc_page(gfp, info, idx);
- if (prealloc_page) {
- if (mem_cgroup_cache_charge(prealloc_page,
- current->mm, GFP_KERNEL)) {
- page_cache_release(prealloc_page);
- prealloc_page = NULL;
- }
- }
- }
+ BUG_ON(page && !PageUptodate(page));
+ *pagep = page;
+ return 0;
}
- error = 0;
- spin_lock(&info->lock);
- shmem_recalc_inode(inode);
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry)) {
- spin_unlock(&info->lock);
- error = PTR_ERR(entry);
- goto failed;
- }
- swap = *entry;
+ /*
+ * Fast cache lookup did not find it:
+ * bring it back from swap or allocate.
+ */
+ info = SHMEM_I(inode);
+ sbinfo = SHMEM_SB(inode->i_sb);
if (swap.val) {
/* Look it up and read it in.. */
- swappage = lookup_swap_cache(swap);
- if (!swappage) {
- shmem_swp_unmap(entry);
+ page = lookup_swap_cache(swap);
+ if (!page) {
/* here we actually do the io */
- if (type && !(*type & VM_FAULT_MAJOR)) {
- __count_vm_event(PGMAJFAULT);
- *type |= VM_FAULT_MAJOR;
- }
- spin_unlock(&info->lock);
- swappage = shmem_swapin(swap, gfp, info, idx);
- if (!swappage) {
- spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- if (entry->val == swap.val)
- error = -ENOMEM;
- shmem_swp_unmap(entry);
- }
- spin_unlock(&info->lock);
- if (error)
- goto failed;
- goto repeat;
+ if (fault_type)
+ *fault_type |= VM_FAULT_MAJOR;
+ page = shmem_swapin(swap, gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto failed;
}
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
- goto repeat;
}
/* We have to do this with page locked to prevent races */
- if (!trylock_page(swappage)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
- goto repeat;
- }
- if (PageWriteback(swappage)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- wait_on_page_writeback(swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
- goto repeat;
- }
- if (!PageUptodate(swappage)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- unlock_page(swappage);
- page_cache_release(swappage);
+ lock_page(page);
+ if (!PageUptodate(page)) {
error = -EIO;
goto failed;
}
-
- if (filepage) {
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- copy_highpage(filepage, swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
- set_page_dirty(filepage);
- swap_free(swap);
- } else if (!(error = add_to_page_cache_locked(swappage, mapping,
- idx, GFP_NOWAIT))) {
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- filepage = swappage;
- set_page_dirty(filepage);
- swap_free(swap);
- } else {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- if (error == -ENOMEM) {
- /*
- * reclaim from proper memory cgroup and
- * call memcg's OOM if needed.
- */
- error = mem_cgroup_shmem_charge_fallback(
- swappage,
- current->mm,
- gfp);
- if (error) {
- unlock_page(swappage);
- page_cache_release(swappage);
- goto failed;
- }
- }
- unlock_page(swappage);
- page_cache_release(swappage);
- goto repeat;
- }
- } else if (sgp == SGP_READ && !filepage) {
- shmem_swp_unmap(entry);
- filepage = find_get_page(mapping, idx);
- if (filepage &&
- (!PageUptodate(filepage) || !trylock_page(filepage))) {
- spin_unlock(&info->lock);
- wait_on_page_locked(filepage);
- page_cache_release(filepage);
- filepage = NULL;
- goto repeat;
+ wait_on_page_writeback(page);
+
+ /* Someone may have already done it for us */
+ if (page->mapping) {
+ if (page->mapping == mapping &&
+ page->index == index)
+ goto done;
+ error = -EEXIST;
+ goto failed;
}
+
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (!error)
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, swp_to_radix_entry(swap));
+ if (error)
+ goto failed;
+
+ spin_lock(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+
} else {
- shmem_swp_unmap(entry);
- sbinfo = SHMEM_SB(inode->i_sb);
+ if (shmem_acct_block(info->flags)) {
+ error = -ENOSPC;
+ goto failed;
+ }
if (sbinfo->max_blocks) {
- if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
- shmem_acct_block(info->flags)) {
- spin_unlock(&info->lock);
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks) >= 0) {
error = -ENOSPC;
- goto failed;
+ goto unacct;
}
percpu_counter_inc(&sbinfo->used_blocks);
- spin_lock(&inode->i_lock);
- inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&inode->i_lock);
- } else if (shmem_acct_block(info->flags)) {
- spin_unlock(&info->lock);
- error = -ENOSPC;
- goto failed;
}
- if (!filepage) {
- int ret;
-
- if (!prealloc_page) {
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(gfp, info, idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
- SetPageSwapBacked(filepage);
-
- /*
- * Precharge page while we can wait, compensate
- * after
- */
- error = mem_cgroup_cache_charge(filepage,
- current->mm, GFP_KERNEL);
- if (error) {
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- goto failed;
- }
-
- spin_lock(&info->lock);
- } else {
- filepage = prealloc_page;
- prealloc_page = NULL;
- SetPageSwapBacked(filepage);
- }
-
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- ret = error || swap.val;
- if (ret)
- mem_cgroup_uncharge_cache_page(filepage);
- else
- ret = add_to_page_cache_lru(filepage, mapping,
- idx, GFP_NOWAIT);
- /*
- * At add_to_page_cache_lru() failure, uncharge will
- * be done automatically.
- */
- if (ret) {
- spin_unlock(&info->lock);
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- if (error)
- goto failed;
- goto repeat;
- }
- info->flags |= SHMEM_PAGEIN;
+ page = shmem_alloc_page(gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto decused;
}
+ SetPageSwapBacked(page);
+ __set_page_locked(page);
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (!error)
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, NULL);
+ if (error)
+ goto decused;
+ lru_cache_add_anon(page);
+
+ spin_lock(&info->lock);
info->alloced++;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
- clear_highpage(filepage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
+
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
if (sgp == SGP_DIRTY)
- set_page_dirty(filepage);
+ set_page_dirty(page);
}
done:
- *pagep = filepage;
- error = 0;
- goto out;
+ /* Perhaps the file has been truncated since we checked */
+ if (sgp != SGP_WRITE &&
+ ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ error = -EINVAL;
+ goto trunc;
+ }
+ *pagep = page;
+ return 0;
+ /*
+ * Error recovery.
+ */
+trunc:
+ ClearPageDirty(page);
+ delete_from_page_cache(page);
+ spin_lock(&info->lock);
+ info->alloced--;
+ inode->i_blocks -= BLOCKS_PER_PAGE;
+ spin_unlock(&info->lock);
+decused:
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -1);
+unacct:
+ shmem_unacct_blocks(info->flags, 1);
failed:
- if (*pagep != filepage) {
- unlock_page(filepage);
- page_cache_release(filepage);
+ if (swap.val && error != -EINVAL) {
+ struct page *test = find_get_page(mapping, index);
+ if (test && !radix_tree_exceptional_entry(test))
+ page_cache_release(test);
+ /* Have another try if the entry has changed */
+ if (test != swp_to_radix_entry(swap))
+ error = -EEXIST;
}
-out:
- if (prealloc_page) {
- mem_cgroup_uncharge_cache_page(prealloc_page);
- page_cache_release(prealloc_page);
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ if (error == -ENOSPC && !once++) {
+ info = SHMEM_I(inode);
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+ goto repeat;
}
+ if (error == -EEXIST)
+ goto repeat;
return error;
}
@@ -1510,33 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
int error;
- int ret;
-
- if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
+ int ret = VM_FAULT_LOCKED;
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
- return ret | VM_FAULT_LOCKED;
+ if (ret & VM_FAULT_MAJOR) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ }
+ return ret;
}
#ifdef CONFIG_NUMA
-static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
- struct inode *i = vma->vm_file->f_path.dentry->d_inode;
- return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}
static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
unsigned long addr)
{
- struct inode *i = vma->vm_file->f_path.dentry->d_inode;
- unsigned long idx;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ pgoff_t index;
- idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
+ index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
#endif
@@ -1597,6 +1108,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
spin_lock_init(&info->lock);
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
+ INIT_LIST_HEAD(&info->xattr_list);
cache_no_acl(inode);
switch (mode & S_IFMT) {
@@ -1633,20 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
-static const struct inode_operations shmem_symlink_inline_operations;
-
-/*
- * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
- * but providing them allows a tmpfs file to be used for splice, sendfile, and
- * below the loop driver, in the generic fashion that many filesystems support.
- */
-static int shmem_readpage(struct file *file, struct page *page)
-{
- struct inode *inode = page->mapping->host;
- int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
- unlock_page(page);
- return error;
-}
+static const struct inode_operations shmem_short_symlink_operations;
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1655,7 +1154,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = NULL;
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
@@ -1680,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
{
struct inode *inode = filp->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
- unsigned long index, offset;
+ pgoff_t index;
+ unsigned long offset;
enum sgp_type sgp = SGP_READ;
/*
@@ -1696,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
for (;;) {
struct page *page = NULL;
- unsigned long end_index, nr, ret;
+ pgoff_t end_index;
+ unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1812,6 +1312,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
return retval;
}
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct address_space *mapping = in->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned int loff, nr_pages, req_pages;
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct page *page;
+ pgoff_t index, end_index;
+ loff_t isize, left;
+ int error, page_nr;
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .flags = flags,
+ .ops = &page_cache_pipe_buf_ops,
+ .spd_release = spd_release_page,
+ };
+
+ isize = i_size_read(inode);
+ if (unlikely(*ppos >= isize))
+ return 0;
+
+ left = isize - *ppos;
+ if (unlikely(left < len))
+ len = left;
+
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ loff = *ppos & ~PAGE_CACHE_MASK;
+ req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ nr_pages = min(req_pages, pipe->buffers);
+
+ spd.nr_pages = find_get_pages_contig(mapping, index,
+ nr_pages, spd.pages);
+ index += spd.nr_pages;
+ error = 0;
+
+ while (spd.nr_pages < nr_pages) {
+ error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
+ spd.pages[spd.nr_pages++] = page;
+ index++;
+ }
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ nr_pages = spd.nr_pages;
+ spd.nr_pages = 0;
+
+ for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+ unsigned int this_len;
+
+ if (!len)
+ break;
+
+ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ page = spd.pages[page_nr];
+
+ if (!PageUptodate(page) || page->mapping != mapping) {
+ error = shmem_getpage(inode, index, &page,
+ SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
+ }
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index))
+ break;
+
+ if (end_index == index) {
+ unsigned int plen;
+
+ plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (plen <= loff)
+ break;
+
+ this_len = min(this_len, plen - loff);
+ len = this_len;
+ }
+
+ spd.partial[page_nr].offset = loff;
+ spd.partial[page_nr].len = this_len;
+ len -= this_len;
+ loff = 0;
+ spd.nr_pages++;
+ index++;
+ }
+
+ while (page_nr < nr_pages)
+ page_cache_release(spd.pages[page_nr++]);
+
+ if (spd.nr_pages)
+ error = splice_to_pipe(pipe, &spd);
+
+ splice_shrink_spd(pipe, &spd);
+
+ if (error > 0) {
+ *ppos += error;
+ file_accessed(in);
+ }
+ return error;
+}
+
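The offset/length bookkeeping in shmem_file_splice_read() can be checked in isolation: each chunk starts at the in-page offset of *ppos and is clipped both to the remaining request and, on the last page, to the bytes below i_size. The userspace sketch below (PAGE_SIZE assumed 4096) walks that clipping for a request that starts mid-page.

/*
 * Worked example of the per-page clipping done by the splice loop.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long long ppos = 5000, isize = 9500;
	unsigned long len = 16384;			/* caller asked for 16K */
	unsigned long loff = ppos & ~PAGE_MASK;		/* 904: offset in first page */

	if (isize - ppos < len)				/* clip to end of file */
		len = isize - ppos;

	for (unsigned long long index = ppos >> 12; len; index++) {
		unsigned long this_len = min_ul(len, PAGE_SIZE - loff);
		unsigned long long end_index = (isize - 1) >> 12;

		if (index == end_index) {		/* last page: clip to isize */
			unsigned long plen = ((isize - 1) & ~PAGE_MASK) + 1;

			this_len = min_ul(this_len, plen - loff);
			len = this_len;
		}
		printf("page %llu: offset %lu, %lu bytes\n", index, loff, this_len);
		len -= this_len;
		loff = 0;
	}
	return 0;
}
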
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1821,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = NAME_MAX;
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree =
- sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
+ buf->f_bavail =
+ buf->f_bfree = sbinfo->max_blocks -
+ percpu_counter_sum(&sbinfo->used_blocks);
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
@@ -1843,8 +1457,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
- error = security_inode_init_security(inode, dir, NULL, NULL,
- NULL);
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name, NULL,
+ NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
@@ -1971,7 +1586,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
int error;
int len;
struct inode *inode;
- struct page *page = NULL;
+ struct page *page;
char *kaddr;
struct shmem_inode_info *info;
@@ -1983,8 +1598,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (!inode)
return -ENOSPC;
- error = security_inode_init_security(inode, dir, NULL, NULL,
- NULL);
+ error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
+ NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
@@ -1995,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
info = SHMEM_I(inode);
inode->i_size = len-1;
- if (len <= (char *)inode - (char *)info) {
- /* do it inline */
- memcpy(info, symname, len);
- inode->i_op = &shmem_symlink_inline_operations;
+ if (len <= SHORT_SYMLINK_LEN) {
+ info->symlink = kmemdup(symname, len, GFP_KERNEL);
+ if (!info->symlink) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ inode->i_op = &shmem_short_symlink_operations;
} else {
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
if (error) {
@@ -2021,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
return 0;
}
-static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
{
- nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
+ nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
return NULL;
}
static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct page *page = NULL;
- int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
- nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
+ int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
+ nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
if (page)
unlock_page(page);
return page;
@@ -2047,63 +1665,252 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
}
}
-static const struct inode_operations shmem_symlink_inline_operations = {
- .readlink = generic_readlink,
- .follow_link = shmem_follow_link_inline,
-};
-
-static const struct inode_operations shmem_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = shmem_follow_link,
- .put_link = shmem_put_link,
-};
-
-#ifdef CONFIG_TMPFS_POSIX_ACL
+#ifdef CONFIG_TMPFS_XATTR
/*
- * Superblocks without xattr inode operations will get security.* xattr
- * support from the VFS "for free". As soon as we have any other xattrs
+ * Superblocks without xattr inode operations may get some security.* xattr
+ * support from the LSM "for free". As soon as we have any other xattrs
* like ACLs, we also need to implement the security.* handlers at
* filesystem level, though.
*/
-static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int handler_flags)
+static int shmem_xattr_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- return security_inode_listsecurity(dentry->d_inode, list, list_len);
-}
+ struct shmem_inode_info *info;
+ struct shmem_xattr *xattr;
+ int ret = -ENODATA;
-static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int handler_flags)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return xattr_getsecurity(dentry->d_inode, name, buffer, size);
+ info = SHMEM_I(dentry->d_inode);
+
+ spin_lock(&info->lock);
+ list_for_each_entry(xattr, &info->xattr_list, list) {
+ if (strcmp(name, xattr->name))
+ continue;
+
+ ret = xattr->size;
+ if (buffer) {
+ if (size < xattr->size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, xattr->value, xattr->size);
+ }
+ break;
+ }
+ spin_unlock(&info->lock);
+ return ret;
}
-static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int handler_flags)
+static int shmem_xattr_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return security_inode_setsecurity(dentry->d_inode, name, value,
- size, flags);
-}
+ struct inode *inode = dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_xattr *xattr;
+ struct shmem_xattr *new_xattr = NULL;
+ size_t len;
+ int err = 0;
+
+ /* value == NULL means remove */
+ if (value) {
+ /* wrap around? */
+ len = sizeof(*new_xattr) + size;
+ if (len <= sizeof(*new_xattr))
+ return -ENOMEM;
+
+ new_xattr = kmalloc(len, GFP_KERNEL);
+ if (!new_xattr)
+ return -ENOMEM;
+
+ new_xattr->name = kstrdup(name, GFP_KERNEL);
+ if (!new_xattr->name) {
+ kfree(new_xattr);
+ return -ENOMEM;
+ }
-static const struct xattr_handler shmem_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = shmem_xattr_security_list,
- .get = shmem_xattr_security_get,
- .set = shmem_xattr_security_set,
-};
+ new_xattr->size = size;
+ memcpy(new_xattr->value, value, size);
+ }
+
+ spin_lock(&info->lock);
+ list_for_each_entry(xattr, &info->xattr_list, list) {
+ if (!strcmp(name, xattr->name)) {
+ if (flags & XATTR_CREATE) {
+ xattr = new_xattr;
+ err = -EEXIST;
+ } else if (new_xattr) {
+ list_replace(&xattr->list, &new_xattr->list);
+ } else {
+ list_del(&xattr->list);
+ }
+ goto out;
+ }
+ }
+ if (flags & XATTR_REPLACE) {
+ xattr = new_xattr;
+ err = -ENODATA;
+ } else {
+ list_add(&new_xattr->list, &info->xattr_list);
+ xattr = NULL;
+ }
+out:
+ spin_unlock(&info->lock);
+ if (xattr)
+ kfree(xattr->name);
+ kfree(xattr);
+ return err;
+}
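
The `len <= sizeof(*new_xattr)` test above is an integer-overflow guard: without it, a huge size could wrap the allocation length, kmalloc() would return a buffer far smaller than the payload, and the following memcpy() would overflow it. A userspace sketch of the same check (the struct layout here is illustrative only):

/*
 * Overflow guard: header size plus payload size must not wrap around.
 */
#include <stdio.h>
#include <stddef.h>

struct xattr_hdr {
	char *name;
	size_t size;
	char value[];		/* payload copied right after the header */
};

static int check_alloc_len(size_t size, size_t *out_len)
{
	size_t len = sizeof(struct xattr_hdr) + size;

	if (len <= sizeof(struct xattr_hdr))	/* wrapped around (or empty) */
		return -1;
	*out_len = len;
	return 0;
}

int main(void)
{
	size_t len;

	printf("size 16 -> %d\n", check_alloc_len(16, &len));
	printf("size SIZE_MAX -> %d\n", check_alloc_len((size_t)-1, &len));
	return 0;
}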
static const struct xattr_handler *shmem_xattr_handlers[] = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
&generic_acl_access_handler,
&generic_acl_default_handler,
- &shmem_xattr_security_handler,
+#endif
NULL
};
+
+static int shmem_xattr_validate(const char *name)
+{
+ struct { const char *prefix; size_t len; } arr[] = {
+ { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
+ { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ size_t preflen = arr[i].len;
+ if (strncmp(name, arr[i].prefix, preflen) == 0) {
+ if (!name[preflen])
+ return -EINVAL;
+ return 0;
+ }
+ }
+ return -EOPNOTSUPP;
+}
+
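shmem_xattr_validate() only admits security.* and trusted.* names here, and rejects a bare prefix with nothing after the dot. A userspace sketch of that filter (the prefix strings are written out literally below; the kernel takes them from <linux/xattr.h>):

/*
 * Accept only "security." and "trusted." names, reject the bare prefix,
 * and report everything else as unsupported by this store.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static int xattr_validate(const char *name)
{
	static const char *prefixes[] = { "security.", "trusted." };
	size_t i;

	for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
		size_t len = strlen(prefixes[i]);

		if (strncmp(name, prefixes[i], len) == 0)
			return name[len] ? 0 : -EINVAL;	/* "trusted." alone is invalid */
	}
	return -EOPNOTSUPP;	/* e.g. "user.foo" is not handled here */
}

int main(void)
{
	printf("%d %d %d\n",
	       xattr_validate("trusted.overlay"),	/* 0 */
	       xattr_validate("security."),		/* -EINVAL */
	       xattr_validate("user.comment"));		/* -EOPNOTSUPP */
	return 0;
}
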
+static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, buffer, size);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ return shmem_xattr_get(dentry, name, buffer, size);
+}
+
+static int shmem_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+
+ return shmem_xattr_set(dentry, name, value, size, flags);
+
+}
+
+static int shmem_removexattr(struct dentry *dentry, const char *name)
+{
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
+}
+
+static bool xattr_is_trusted(const char *name)
+{
+ return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+}
+
+static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ bool trusted = capable(CAP_SYS_ADMIN);
+ struct shmem_xattr *xattr;
+ struct shmem_inode_info *info;
+ size_t used = 0;
+
+ info = SHMEM_I(dentry->d_inode);
+
+ spin_lock(&info->lock);
+ list_for_each_entry(xattr, &info->xattr_list, list) {
+ size_t len;
+
+ /* skip "trusted." attributes for unprivileged callers */
+ if (!trusted && xattr_is_trusted(xattr->name))
+ continue;
+
+ len = strlen(xattr->name) + 1;
+ used += len;
+ if (buffer) {
+ if (size < used) {
+ used = -ERANGE;
+ break;
+ }
+ memcpy(buffer, xattr->name, len);
+ buffer += len;
+ }
+ }
+ spin_unlock(&info->lock);
+
+ return used;
+}
+#endif /* CONFIG_TMPFS_XATTR */
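The listxattr implementation above follows the usual contract: with a NULL buffer it reports the space needed, with a short buffer it returns -ERANGE, and "trusted." names are hidden from unprivileged callers. A hedged user-space sketch of the matching two-call pattern (path and error handling are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/dev/shm/example";	/* hypothetical tmpfs file */
	ssize_t len = listxattr(path, NULL, 0);	/* size probe */
	char *buf, *p;

	if (len <= 0)
		return 1;
	buf = malloc(len);
	if (!buf)
		return 1;
	len = listxattr(path, buf, len);	/* fetch the NUL-separated names */
	if (len < 0) {
		free(buf);
		return 1;
	}
	for (p = buf; p < buf + len; p += strlen(p) + 1)
		printf("%s\n", p);
	free(buf);
	return 0;
}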
+
+static const struct inode_operations shmem_short_symlink_operations = {
+ .readlink = generic_readlink,
+ .follow_link = shmem_follow_short_symlink,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
#endif
+};
+
+static const struct inode_operations shmem_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = shmem_follow_link,
+ .put_link = shmem_put_link,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
+};
static struct dentry *shmem_get_parent(struct dentry *child)
{
@@ -2144,8 +1951,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
{
struct inode *inode = dentry->d_inode;
- if (*len < 3)
+ if (*len < 3) {
+ *len = 3;
return 255;
+ }
if (inode_unhashed(inode)) {
/* Unfortunately insert_inode_hash is not idempotent,
@@ -2282,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
if (config.max_inodes < inodes)
goto out;
/*
- * Those tests also disallow limited->unlimited while any are in
- * use, so i_blocks will always be zero when max_blocks is zero;
+ * Those tests disallow limited->unlimited while any are in use;
* but we must separately disallow unlimited->limited, because
* in that case we have no record of how much is already in use.
*/
@@ -2375,14 +2183,16 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
- sb->s_maxbytes = SHMEM_MAX_BYTES;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
sb->s_time_gran = 1;
-#ifdef CONFIG_TMPFS_POSIX_ACL
+#ifdef CONFIG_TMPFS_XATTR
sb->s_xattr = shmem_xattr_handlers;
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
@@ -2408,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
- struct shmem_inode_info *p;
- p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
- if (!p)
+ struct shmem_inode_info *info;
+ info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+ if (!info)
return NULL;
- return &p->vfs_inode;
+ return &info->vfs_inode;
}
-static void shmem_i_callback(struct rcu_head *head)
+static void shmem_destroy_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
INIT_LIST_HEAD(&inode->i_dentry);
@@ -2424,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head)
static void shmem_destroy_inode(struct inode *inode)
{
- if ((inode->i_mode & S_IFMT) == S_IFREG) {
- /* only struct inode is valid if it's an inline symlink */
+ if ((inode->i_mode & S_IFMT) == S_IFREG)
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
- }
- call_rcu(&inode->i_rcu, shmem_i_callback);
+ call_rcu(&inode->i_rcu, shmem_destroy_callback);
}
-static void init_once(void *foo)
+static void shmem_init_inode(void *foo)
{
- struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
-
- inode_init_once(&p->vfs_inode);
+ struct shmem_inode_info *info = foo;
+ inode_init_once(&info->vfs_inode);
}
-static int init_inodecache(void)
+static int shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, SLAB_PANIC, init_once);
+ 0, SLAB_PANIC, shmem_init_inode);
return 0;
}
-static void destroy_inodecache(void)
+static void shmem_destroy_inodecache(void)
{
kmem_cache_destroy(shmem_inode_cachep);
}
@@ -2455,7 +2262,6 @@ static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
- .readpage = shmem_readpage,
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
@@ -2472,22 +2278,20 @@ static const struct file_operations shmem_file_operations = {
.aio_read = shmem_file_aio_read,
.aio_write = generic_file_aio_write,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = shmem_file_splice_read,
.splice_write = generic_file_splice_write,
#endif
};
static const struct inode_operations shmem_inode_operations = {
- .setattr = shmem_notify_change,
+ .setattr = shmem_setattr,
.truncate_range = shmem_truncate_range,
-#ifdef CONFIG_TMPFS_POSIX_ACL
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = generic_listxattr,
- .removexattr = generic_removexattr,
- .check_acl = generic_check_acl,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
#endif
-
};
static const struct inode_operations shmem_dir_inode_operations = {
@@ -2502,24 +2306,26 @@ static const struct inode_operations shmem_dir_inode_operations = {
.mknod = shmem_mknod,
.rename = shmem_rename,
#endif
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- .setattr = shmem_notify_change,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = generic_listxattr,
- .removexattr = generic_removexattr,
- .check_acl = generic_check_acl,
+ .setattr = shmem_setattr,
#endif
};
static const struct inode_operations shmem_special_inode_operations = {
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- .setattr = shmem_notify_change,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = generic_listxattr,
- .removexattr = generic_removexattr,
- .check_acl = generic_check_acl,
+ .setattr = shmem_setattr,
#endif
};
@@ -2544,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
#endif
};
-
static struct dentry *shmem_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
-static struct file_system_type tmpfs_fs_type = {
+static struct file_system_type shmem_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
.mount = shmem_mount,
.kill_sb = kill_litter_super,
};
-int __init init_tmpfs(void)
+int __init shmem_init(void)
{
int error;
@@ -2566,18 +2371,18 @@ int __init init_tmpfs(void)
if (error)
goto out4;
- error = init_inodecache();
+ error = shmem_init_inodecache();
if (error)
goto out3;
- error = register_filesystem(&tmpfs_fs_type);
+ error = register_filesystem(&shmem_fs_type);
if (error) {
printk(KERN_ERR "Could not register tmpfs\n");
goto out2;
}
- shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
- tmpfs_fs_type.name, NULL);
+ shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
+ shmem_fs_type.name, NULL);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2586,9 +2391,9 @@ int __init init_tmpfs(void)
return 0;
out1:
- unregister_filesystem(&tmpfs_fs_type);
+ unregister_filesystem(&shmem_fs_type);
out2:
- destroy_inodecache();
+ shmem_destroy_inodecache();
out3:
bdi_destroy(&shmem_backing_dev_info);
out4:
@@ -2596,45 +2401,6 @@ out4:
return error;
}
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
- * @inode: the inode to be searched
- * @pgoff: the offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @ent: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
- struct page **pagep, swp_entry_t *ent)
-{
- swp_entry_t entry = { .val = 0 }, *ptr;
- struct page *page = NULL;
- struct shmem_inode_info *info = SHMEM_I(inode);
-
- if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- goto out;
-
- spin_lock(&info->lock);
- ptr = shmem_swp_entry(info, pgoff, NULL);
-#ifdef CONFIG_SWAP
- if (ptr && ptr->val) {
- entry.val = ptr->val;
- page = find_get_page(&swapper_space, entry.val);
- } else
-#endif
- page = find_get_page(inode->i_mapping, pgoff);
- if (ptr)
- shmem_swp_unmap(ptr);
- spin_unlock(&info->lock);
-out:
- *pagep = page;
- *ent = entry;
-}
-#endif
-
#else /* !CONFIG_SHMEM */
/*
@@ -2648,23 +2414,23 @@ out:
#include <linux/ramfs.h>
-static struct file_system_type tmpfs_fs_type = {
+static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.mount = ramfs_mount,
.kill_sb = kill_litter_super,
};
-int __init init_tmpfs(void)
+int __init shmem_init(void)
{
- BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
+ BUG_ON(register_filesystem(&shmem_fs_type) != 0);
- shm_mnt = kern_mount(&tmpfs_fs_type);
+ shm_mnt = kern_mount(&shmem_fs_type);
BUG_ON(IS_ERR(shm_mnt));
return 0;
}
-int shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t swap, struct page *page)
{
return 0;
}
@@ -2674,37 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
return 0;
}
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
- * @inode: the inode to be searched
- * @pgoff: the offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @ent: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
- struct page **pagep, swp_entry_t *ent)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
- struct page *page = NULL;
-
- if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- goto out;
- page = find_get_page(inode->i_mapping, pgoff);
-out:
- *pagep = page;
- *ent = (swp_entry_t){ .val = 0 };
+ truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
-#endif
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
#define shmem_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
-#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
#endif /* CONFIG_SHMEM */
@@ -2728,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
if (IS_ERR(shm_mnt))
return (void *)shm_mnt;
- if (size < 0 || size > SHMEM_MAX_BYTES)
+ if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
if (shmem_acct_size(flags, size))
@@ -2791,5 +2537,45 @@ int shmem_zero_setup(struct vm_area_struct *vma)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
+
+/**
+ * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * suit tmpfs, since it may have pages in swapcache, and needs to find those
+ * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
+ *
+ * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
+ * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
+ */
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+#ifdef CONFIG_SHMEM
+ struct inode *inode = mapping->host;
+ struct page *page;
+ int error;
+
+ BUG_ON(mapping->a_ops != &shmem_aops);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+ if (error)
+ page = ERR_PTR(error);
+ else
+ unlock_page(page);
+ return page;
+#else
+ /*
+ * The tiny !SHMEM case uses ramfs without swap
+ */
+ return read_cache_page_gfp(mapping, index, gfp);
+#endif
+}
+EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
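The kernel-doc above points at the intended callers: GEM-style drivers that want tmpfs-backed object pages without risking the OOM killer on a transient shortage. A hedged in-kernel sketch of that pattern (function and variable names are illustrative, not copied from i915 or ttm; header placement assumes this series):

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct page *example_get_object_page(struct address_space *mapping,
					    pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping);

	/* fail quietly on pressure instead of retrying into an OOM */
	gfp |= __GFP_NORETRY | __GFP_NOWARN;
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}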
diff --git a/mm/slab.c b/mm/slab.c
index 37961d1f584..6d90a091fdc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
#include <linux/memory.h>
+#include <linux/prefetch.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -191,22 +192,6 @@ typedef unsigned int kmem_bufctl_t;
#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
/*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
- struct list_head list;
- unsigned long colouroff;
- void *s_mem; /* including colour offset */
- unsigned int inuse; /* num of objs active in slab */
- kmem_bufctl_t free;
- unsigned short nodeid;
-};
-
-/*
* struct slab_rcu
*
* slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +204,6 @@ struct slab {
*
* rcu_read_lock before reading the address, then rcu_read_unlock after
* taking the spinlock within the structure expected at that address.
- *
- * We assume struct slab_rcu can overlay struct slab when destroying.
*/
struct slab_rcu {
struct rcu_head head;
@@ -229,6 +212,27 @@ struct slab_rcu {
};
/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from a general cache.
+ * Slabs are chained into three lists: fully used, partial, fully free slabs.
+ */
+struct slab {
+ union {
+ struct {
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+ unsigned short nodeid;
+ };
+ struct slab_rcu __slab_cover_slab_rcu;
+ };
+};
+
+/*
* struct array_cache
*
* Purpose:
@@ -570,7 +574,9 @@ static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
static struct kmem_cache cache_cache = {
+ .nodelists = cache_cache_nodelists,
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
@@ -616,6 +622,51 @@ int slab_is_available(void)
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;
+static struct lock_class_key debugobj_l3_key;
+static struct lock_class_key debugobj_alc_key;
+
+static void slab_set_lock_classes(struct kmem_cache *cachep,
+ struct lock_class_key *l3_key, struct lock_class_key *alc_key,
+ int q)
+{
+ struct array_cache **alc;
+ struct kmem_list3 *l3;
+ int r;
+
+ l3 = cachep->nodelists[q];
+ if (!l3)
+ return;
+
+ lockdep_set_class(&l3->list_lock, l3_key);
+ alc = l3->alien;
+ /*
+ * FIXME: This check for BAD_ALIEN_MAGIC
+ * should go away when common slab code is taught to
+ * work even without alien caches.
+ * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+ * for alloc_alien_cache,
+ */
+ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+ return;
+ for_each_node(r) {
+ if (alc[r])
+ lockdep_set_class(&alc[r]->lock, alc_key);
+ }
+}
+
+static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
+{
+ slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
+}
+
+static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
+{
+ int node;
+
+ for_each_online_node(node)
+ slab_set_debugobj_lock_classes_node(cachep, node);
+}
+
static void init_node_lock_keys(int q)
{
struct cache_sizes *s = malloc_sizes;
@@ -624,29 +675,14 @@ static void init_node_lock_keys(int q)
return;
for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
- struct array_cache **alc;
struct kmem_list3 *l3;
- int r;
l3 = s->cs_cachep->nodelists[q];
if (!l3 || OFF_SLAB(s->cs_cachep))
continue;
- lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
- alc = l3->alien;
- /*
- * FIXME: This check for BAD_ALIEN_MAGIC
- * should go away when common slab code is taught to
- * work even without alien caches.
- * Currently, non NUMA code returns BAD_ALIEN_MAGIC
- * for alloc_alien_cache,
- */
- if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
- continue;
- for_each_node(r) {
- if (alc[r])
- lockdep_set_class(&alc[r]->lock,
- &on_slab_alc_key);
- }
+
+ slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
+ &on_slab_alc_key, q);
}
}
@@ -665,6 +701,14 @@ static void init_node_lock_keys(int q)
static inline void init_lock_keys(void)
{
}
+
+static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
+{
+}
+
+static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
+{
+}
#endif
/*
@@ -875,7 +919,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
nc = kmalloc_node(memsize, gfp, node);
/*
* The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transfered to another
+ * However, when such objects are allocated or transferred to another
* cache the pointers are not cleared and they could be counted as
* valid references during a kmemleak scan. Therefore, kmemleak must
* not scan such objects.
@@ -1258,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu)
spin_unlock_irq(&l3->list_lock);
kfree(shared);
free_alien_cache(alien);
+ if (cachep->flags & SLAB_DEBUG_OBJECTS)
+ slab_set_debugobj_lock_classes_node(cachep, node);
}
init_node_lock_keys(node);
@@ -1387,7 +1433,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
break;
}
out:
- return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+ return notifier_from_errno(ret);
}
#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
@@ -1488,11 +1534,10 @@ void __init kmem_cache_init(void)
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
- * struct kmem_cache size depends on nr_node_ids, which
- * can be less than MAX_NUMNODES.
+ * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
- cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
- nr_node_ids * sizeof(struct kmem_list3 *);
+ cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ nr_node_ids * sizeof(struct kmem_list3 *);
#if DEBUG
cache_cache.obj_size = cache_cache.buffer_size;
#endif
@@ -1621,6 +1666,9 @@ void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
+ /* Annotate slab for lockdep -- annotate the malloc caches */
+ init_lock_keys();
+
/* 6) resize the head arrays to their final sizes */
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
@@ -1631,9 +1679,6 @@ void __init kmem_cache_init_late(void)
/* Done! */
g_cpucache_up = FULL;
- /* Annotate slab for lockdep -- annotate the malloc caches */
- init_lock_keys();
-
/*
* Register a cpu startup notifier callback that initializes
* cpu_cache_get for all new cpus
@@ -2147,8 +2192,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
*
* @name must be valid until the cache is destroyed. This implies that
* the module calling this has to destroy the cache before getting unloaded.
- * Note that kmem_cache_name() is not guaranteed to return the same pointer,
- * therefore applications must manage it themselves.
*
* The flags are
*
@@ -2288,8 +2331,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (ralign < align) {
ralign = align;
}
- /* disable debug if not aligning with REDZONE_ALIGN */
- if (ralign & (__alignof__(unsigned long long) - 1))
+ /* disable debug if necessary */
+ if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
@@ -2306,6 +2349,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (!cachep)
goto oops;
+ cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
#if DEBUG
cachep->obj_size = size;
@@ -2315,8 +2359,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
- cachep->obj_offset += align;
- size += align + sizeof(unsigned long long);
+ cachep->obj_offset += sizeof(unsigned long long);
+ size += 2 * sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
@@ -2422,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
goto oops;
}
+ if (flags & SLAB_DEBUG_OBJECTS) {
+ /*
+ * Would deadlock through slab_destroy()->call_rcu()->
+ * debug_object_activate()->kmem_cache_alloc().
+ */
+ WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
+
+ slab_set_debugobj_lock_classes(cachep);
+ }
+
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
oops:
@@ -2605,7 +2659,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
*
* The cache must be empty before calling this function.
*
- * The caller must guarantee that noone will allocate memory from the cache
+ * The caller must guarantee that no one will allocate memory from the cache
* during the kmem_cache_destroy().
*/
void kmem_cache_destroy(struct kmem_cache *cachep)
@@ -3151,12 +3205,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
-#if ARCH_SLAB_MINALIGN
- if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+ if (ARCH_SLAB_MINALIGN &&
+ ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
- objp, ARCH_SLAB_MINALIGN);
+ objp, (int)ARCH_SLAB_MINALIGN);
}
-#endif
return objp;
}
#else
@@ -3400,7 +3453,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- if (nodeid == -1)
+ if (nodeid == NUMA_NO_NODE)
nodeid = slab_node;
if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3602,13 +3655,14 @@ free_done:
* Release an obj back to its cache. If the obj has a constructed state, it must
* be in this state _before_ it is released. Called with disabled ints.
*/
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp,
+ void *caller)
{
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
- objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+ objp = cache_free_debugcheck(cachep, objp, caller);
kmemcheck_slab_free(cachep, objp, obj_size(cachep));
@@ -3799,7 +3853,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
debug_check_no_locks_freed(objp, obj_size(cachep));
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, obj_size(cachep));
- __cache_free(cachep, objp);
+ __cache_free(cachep, objp, __builtin_return_address(0));
local_irq_restore(flags);
trace_kmem_cache_free(_RET_IP_, objp);
@@ -3829,7 +3883,7 @@ void kfree(const void *objp)
c = virt_to_cache(objp);
debug_check_no_locks_freed(objp, obj_size(c));
debug_check_no_obj_freed(objp, obj_size(c));
- __cache_free(c, (void *)objp);
+ __cache_free(c, (void *)objp, __builtin_return_address(0));
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
@@ -3840,12 +3894,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *cachep)
-{
- return cachep->name;
-}
-EXPORT_SYMBOL_GPL(kmem_cache_name);
-
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
@@ -3936,7 +3984,7 @@ fail:
struct ccupdate_struct {
struct kmem_cache *cachep;
- struct array_cache *new[NR_CPUS];
+ struct array_cache *new[0];
};
static void do_ccupdate_local(void *info)
@@ -3958,7 +4006,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
struct ccupdate_struct *new;
int i;
- new = kzalloc(sizeof(*new), gfp);
+ new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
+ gfp);
if (!new)
return -ENOMEM;
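The ccupdate_struct change above turns the fixed NR_CPUS-sized pointer array into a trailing array sized at allocation time from nr_cpu_ids. A hedged user-space restatement of that allocation pattern (types and names are stand-ins, not the kernel structures):

#include <stdio.h>
#include <stdlib.h>

struct ccupdate_example {
	void *cachep;		/* fixed header, stands in for the kmem_cache pointer */
	void *new[0];		/* trailing array, sized per allocation */
};

int main(void)
{
	int nr_cpu_ids = 8;	/* illustrative; the kernel uses the boot-time value */
	struct ccupdate_example *new;

	/* header plus one slot per possible cpu, zeroed like kzalloc() */
	new = calloc(1, sizeof(*new) + nr_cpu_ids * sizeof(void *));
	if (!new)
		return 1;
	new->new[nr_cpu_ids - 1] = NULL;	/* last slot is valid storage */
	printf("allocated %zu bytes\n",
	       sizeof(*new) + nr_cpu_ids * sizeof(void *));
	free(new);
	return 0;
}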
diff --git a/mm/slob.c b/mm/slob.c
index 3588eaaef72..bf391818716 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -70,7 +70,7 @@
#include <trace/events/kmem.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
/*
* slob_block has a field 'units', which indicates size of block if +ve,
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
+ gfp &= gfp_allowed_mask;
+
lockdep_trace_alloc(gfp);
if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
+ flags &= gfp_allowed_mask;
+
+ lockdep_trace_alloc(flags);
+
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
@@ -666,12 +672,6 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
}
EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *c)
-{
- return c->name;
-}
-EXPORT_SYMBOL(kmem_cache_name);
-
int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
diff --git a/mm/slub.c b/mm/slub.c
index e15aa7f193c..9f662d70eb4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
* SLUB: A slab allocator that limits cache line use instead of queuing
* objects in per cpu and per node lists.
*
- * The allocator synchronizes using per slab locks and only
- * uses a centralized lock to manage a pool of partial slabs.
+ * The allocator synchronizes using per slab locks or atomic operations
+ * and only uses a centralized lock to manage a pool of partial slabs.
*
* (C) 2007 SGI, Christoph Lameter
+ * (C) 2011 Linux Foundation, Christoph Lameter
*/
#include <linux/mm.h>
@@ -27,20 +28,33 @@
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
+#include <linux/stacktrace.h>
#include <trace/events/kmem.h>
/*
* Lock order:
- * 1. slab_lock(page)
- * 2. slab->list_lock
+ * 1. slub_lock (Global Semaphore)
+ * 2. node->list_lock
+ * 3. slab_lock(page) (Only on some arches and for debugging)
*
- * The slab_lock protects operations on the object of a particular
- * slab and its metadata in the page struct. If the slab lock
- * has been taken then no allocations nor frees can be performed
- * on the objects in the slab nor can the slab be added or removed
- * from the partial or full lists since this would mean modifying
- * the page_struct of the slab.
+ * slub_lock
+ *
+ * The role of the slub_lock is to protect the list of all the slabs
+ * and to synchronize major metadata changes to slab cache structures.
+ *
+ * The slab_lock is only used for debugging and on arches that do not
+ * have the ability to do a cmpxchg_double. It only protects the second
+ * double word in the page struct. Meaning
+ * A. page->freelist -> List of object free in a page
+ * B. page->counters -> Counters of objects
+ * C. page->frozen -> frozen state
+ *
+ * If a slab is frozen then it is exempt from list management. It is not
+ * on any list. The processor that froze the slab is the one who can
+ * perform list operations on the page. Other processors may put objects
+ * onto the freelist but the processor that froze the slab is the only
+ * one that can retrieve the objects from the page's freelist.
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -53,20 +67,6 @@
* slabs, operations can continue without any centralized lock. F.e.
* allocating a long series of objects that fill up slabs does not require
* the list lock.
- *
- * The lock order is sometimes inverted when we are trying to get a slab
- * off a list. We take the list_lock and then look for a page on the list
- * to use. While we do that objects in the slabs may be freed. We can
- * only operate on the slab if we have also taken the slab_lock. So we use
- * a slab_trylock() on the slab. If trylock was successful then no frees
- * can occur anymore and we can use the slab for allocations etc. If the
- * slab_trylock() does not succeed then frees are in progress in the slab and
- * we must stay away from it for a while since we may cause a bouncing
- * cacheline if we try to acquire the lock. So go onto the next slab.
- * If all pages are busy then we may allocate a new slab instead of reusing
- * a partial slab. A new slab has noone operating on it and thus there is
- * no danger of cacheline contention.
- *
* Interrupts are disabled during allocation and deallocation in order to
* make the slab allocator safe to use in the context of an irq. In addition
* interrupts are disabled to ensure that the processor does not change
@@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST
+/* Enable to log cmpxchg failures */
+#undef SLUB_DEBUG_CMPXCHG
+
/*
* Mininum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#define OO_SHIFT 16
#define OO_MASK ((1 << OO_SHIFT) - 1)
-#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
+#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000UL /* Poison object */
+#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
static int kmem_size = sizeof(struct kmem_cache);
@@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches);
/*
* Tracking user of a slab.
*/
+#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
+#ifdef CONFIG_STACKTRACE
+ unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
+#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
unsigned long when; /* When did the operation occur */
@@ -217,7 +225,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
#endif
-static inline void stat(struct kmem_cache *s, enum stat_item si)
+static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
__this_cpu_inc(s->cpu_slab->stat[si]);
@@ -261,6 +269,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return *(void **)(object + s->offset);
}
+static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
+{
+ void *p;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
+#else
+ p = get_freepointer(s, object);
+#endif
+ return p;
+}
+
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
*(void **)(object + s->offset) = fp;
@@ -271,21 +291,46 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
__p += (__s)->size)
-/* Scan freelist */
-#define for_each_free_object(__p, __s, __free) \
- for (__p = (__free); __p; __p = get_freepointer((__s), __p))
-
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
{
return (p - addr) / s->size;
}
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->objsize;
+
+#endif
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+}
+
+static inline int order_objects(int order, unsigned long size, int reserved)
+{
+ return ((PAGE_SIZE << order) - reserved) / size;
+}
+
static inline struct kmem_cache_order_objects oo_make(int order,
- unsigned long size)
+ unsigned long size, int reserved)
{
struct kmem_cache_order_objects x = {
- (order << OO_SHIFT) + (PAGE_SIZE << order) / size
+ (order << OO_SHIFT) + order_objects(order, size, reserved)
};
return x;
@@ -301,8 +346,111 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
return x.x & OO_MASK;
}
+/*
+ * Per slab locking using the pagelock
+ */
+static __always_inline void slab_lock(struct page *page)
+{
+ bit_spin_lock(PG_locked, &page->flags);
+}
+
+static __always_inline void slab_unlock(struct page *page)
+{
+ __bit_spin_unlock(PG_locked, &page->flags);
+}
+
+/* Interrupts must be disabled (for the fallback code to work right) */
+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new,
+ const char *n)
+{
+ VM_BUG_ON(!irqs_disabled());
+#ifdef CONFIG_CMPXCHG_DOUBLE
+ if (s->flags & __CMPXCHG_DOUBLE) {
+ if (cmpxchg_double(&page->freelist,
+ freelist_old, counters_old,
+ freelist_new, counters_new))
+ return 1;
+ } else
+#endif
+ {
+ slab_lock(page);
+ if (page->freelist == freelist_old && page->counters == counters_old) {
+ page->freelist = freelist_new;
+ page->counters = counters_new;
+ slab_unlock(page);
+ return 1;
+ }
+ slab_unlock(page);
+ }
+
+ cpu_relax();
+ stat(s, CMPXCHG_DOUBLE_FAIL);
+
+#ifdef SLUB_DEBUG_CMPXCHG
+ printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+ return 0;
+}
+
+static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new,
+ const char *n)
+{
+#ifdef CONFIG_CMPXCHG_DOUBLE
+ if (s->flags & __CMPXCHG_DOUBLE) {
+ if (cmpxchg_double(&page->freelist,
+ freelist_old, counters_old,
+ freelist_new, counters_new))
+ return 1;
+ } else
+#endif
+ {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ slab_lock(page);
+ if (page->freelist == freelist_old && page->counters == counters_old) {
+ page->freelist = freelist_new;
+ page->counters = counters_new;
+ slab_unlock(page);
+ local_irq_restore(flags);
+ return 1;
+ }
+ slab_unlock(page);
+ local_irq_restore(flags);
+ }
+
+ cpu_relax();
+ stat(s, CMPXCHG_DOUBLE_FAIL);
+
+#ifdef SLUB_DEBUG_CMPXCHG
+ printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+ return 0;
+}
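Callers of the two helpers above follow a read-modify-retry shape: snapshot freelist and counters, build the replacement values, and loop until the cmpxchg (or its locked fallback) sees an unchanged snapshot. A hedged, simplified sketch of that shape (freeze_example() is illustrative; the real users are acquire_slab() and deactivate_slab() below):

static void freeze_example(struct kmem_cache *s, struct page *page)
{
	void *freelist;
	unsigned long counters;
	struct page new;

	do {
		freelist = page->freelist;	/* snapshot both words */
		counters = page->counters;
		new.counters = counters;
		new.frozen = 1;			/* the single change we want */
	} while (!cmpxchg_double_slab(s, page,
				      freelist, counters,
				      freelist, new.counters,
				      "freeze_example"));
}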
+
#ifdef CONFIG_SLUB_DEBUG
/*
+ * Determine a map of object in use on a page.
+ *
+ * Node listlock must be held to guarantee that the page does
+ * not vanish from under us.
+ */
+static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
+{
+ void *p;
+ void *addr = page_address(page);
+
+ for (p = page->freelist; p; p = get_freepointer(s, p))
+ set_bit(slab_index(p, s, addr), map);
+}
+
+/*
* Debug settings:
*/
#ifdef CONFIG_SLUB_DEBUG_ON
@@ -368,6 +516,24 @@ static void set_track(struct kmem_cache *s, void *object,
struct track *p = get_track(s, object, alloc);
if (addr) {
+#ifdef CONFIG_STACKTRACE
+ struct stack_trace trace;
+ int i;
+
+ trace.nr_entries = 0;
+ trace.max_entries = TRACK_ADDRS_COUNT;
+ trace.entries = p->addrs;
+ trace.skip = 3;
+ save_stack_trace(&trace);
+
+ /* See rant in lockdep.c */
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries - 1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
+ p->addrs[i] = 0;
+#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
@@ -392,6 +558,16 @@ static void print_track(const char *s, struct track *t)
printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKTRACE
+ {
+ int i;
+ for (i = 0; i < TRACK_ADDRS_COUNT; i++)
+ if (t->addrs[i])
+ printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
+ else
+ break;
+ }
+#endif
}
static void print_tracking(struct kmem_cache *s, void *object)
@@ -505,10 +681,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
memset(p + s->objsize, val, s->inuse - s->objsize);
}
-static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
{
while (bytes) {
- if (*start != (u8)value)
+ if (*start != value)
return start;
start++;
bytes--;
@@ -516,6 +692,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
return NULL;
}
+static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
+{
+ u64 value64;
+ unsigned int words, prefix;
+
+ if (bytes <= 16)
+ return check_bytes8(start, value, bytes);
+
+ value64 = value | value << 8 | value << 16 | value << 24;
+ value64 = (value64 & 0xffffffff) | value64 << 32;
+ prefix = 8 - ((unsigned long)start) % 8;
+
+ if (prefix) {
+ u8 *r = check_bytes8(start, value, prefix);
+ if (r)
+ return r;
+ start += prefix;
+ bytes -= prefix;
+ }
+
+ words = bytes / 8;
+
+ while (words) {
+ if (*(u64 *)start != value64)
+ return check_bytes8(start, value, 8);
+ start += 8;
+ words--;
+ }
+
+ return check_bytes8(start, value, bytes % 8);
+}
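check_bytes() returns the address of the first byte that differs from the expected fill, or NULL when the range is clean; the word-at-a-time path above only narrows down where to re-run the byte scan. A hedged fragment showing the kind of poison check it backs (POISON_EXAMPLE and the report-side handling are illustrative):

#define POISON_EXAMPLE 0x6b	/* same fill value as POISON_FREE */

static int poison_intact(u8 *object, unsigned int size)
{
	u8 *fault = check_bytes(object, POISON_EXAMPLE, size);

	if (!fault)
		return 1;			/* untouched since it was freed */
	/* fault - object is the offset of the first corrupted byte */
	return 0;
}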
+
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
@@ -617,7 +825,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- length = (PAGE_SIZE << compound_order(page));
+ length = (PAGE_SIZE << compound_order(page)) - s->reserved;
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -698,7 +906,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
return 0;
}
- maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+ maxobj = order_objects(compound_order(page), s->size, s->reserved);
if (page->objects > maxobj) {
slab_err(s, page, "objects %u > max %u",
s->name, page->objects, maxobj);
@@ -721,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
int nr = 0;
- void *fp = page->freelist;
+ void *fp;
void *object = NULL;
unsigned long max_objects;
+ fp = page->freelist;
while (fp && nr <= page->objects) {
if (fp == search)
return 1;
@@ -748,7 +957,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+ max_objects = order_objects(compound_order(page), s->size, s->reserved);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -800,45 +1009,56 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
{
flags &= gfp_allowed_mask;
- kmemcheck_slab_alloc(s, flags, object, s->objsize);
+ kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
}
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
kmemleak_free_recursive(x, s->flags);
-}
-static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
-{
- kmemcheck_slab_free(s, object, s->objsize);
- debug_check_no_locks_freed(object, s->objsize);
+ /*
+	 * Trouble is that we may no longer disable interrupts in the fast path
+ * So in order to make the debug calls that expect irqs to be
+ * disabled we need to disable interrupts temporarily.
+ */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+ {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kmemcheck_slab_free(s, x, s->objsize);
+ debug_check_no_locks_freed(x, s->objsize);
+ local_irq_restore(flags);
+ }
+#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(object, s->objsize);
+ debug_check_no_obj_freed(x, s->objsize);
}
/*
* Tracking of fully allocated slabs for debugging purposes.
+ *
+ * list_lock must be held.
*/
-static void add_full(struct kmem_cache_node *n, struct page *page)
+static void add_full(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page)
{
- spin_lock(&n->list_lock);
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
list_add(&page->lru, &n->full);
- spin_unlock(&n->list_lock);
}
+/*
+ * list_lock must be held.
+ */
static void remove_full(struct kmem_cache *s, struct page *page)
{
- struct kmem_cache_node *n;
-
if (!(s->flags & SLAB_STORE_USER))
return;
- n = get_node(s, page_to_nid(page));
-
- spin_lock(&n->list_lock);
list_del(&page->lru);
- spin_unlock(&n->list_lock);
}
/* Tracking of the number of slabs for debugging purposes */
@@ -894,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
if (!check_slab(s, page))
goto bad;
- if (!on_freelist(s, page, object)) {
- object_err(s, page, object, "Object already allocated");
- goto bad;
- }
-
if (!check_valid_pointer(s, page, object)) {
object_err(s, page, object, "Freelist Pointer check fails");
goto bad;
@@ -931,6 +1146,12 @@ bad:
static noinline int free_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr)
{
+ unsigned long flags;
+ int rc = 0;
+
+ local_irq_save(flags);
+ slab_lock(page);
+
if (!check_slab(s, page))
goto fail;
@@ -945,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
}
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
- return 0;
+ goto out;
if (unlikely(s != page->slab)) {
if (!PageSlab(page)) {
@@ -962,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
goto fail;
}
- /* Special debug activities for freeing objects */
- if (!PageSlubFrozen(page) && !page->freelist)
- remove_full(s, page);
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
init_object(s, object, SLUB_RED_INACTIVE);
- return 1;
+ rc = 1;
+out:
+ slab_unlock(page);
+ local_irq_restore(flags);
+ return rc;
fail:
slab_fix(s, "Object at 0x%p not freed", object);
- return 0;
+ goto out;
}
static int __init setup_slub_debug(char *str)
@@ -1073,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
void *object, u8 val) { return 1; }
-static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
+static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
void (*ctor)(void *))
@@ -1101,9 +1325,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
-static inline void slab_free_hook_irq(struct kmem_cache *s,
- void *object) {}
-
#endif /* CONFIG_SLUB_DEBUG */
/*
@@ -1128,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
+ flags &= gfp_allowed_mask;
+
+ if (flags & __GFP_WAIT)
+ local_irq_enable();
+
flags |= s->allocflags;
/*
@@ -1144,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Try a lower order alloc if possible
*/
page = alloc_slab_page(flags, node, oo);
- if (!page)
- return NULL;
- stat(s, ORDER_FALLBACK);
+ if (page)
+ stat(s, ORDER_FALLBACK);
}
+ if (flags & __GFP_WAIT)
+ local_irq_disable();
+
+ if (!page)
+ return NULL;
+
if (kmemcheck_enabled
&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
@@ -1217,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
page->freelist = start;
page->inuse = 0;
+ page->frozen = 1;
out:
return page;
}
@@ -1249,21 +1481,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__free_pages(page, order);
}
+#define need_reserve_slab_rcu \
+ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
static void rcu_free_slab(struct rcu_head *h)
{
struct page *page;
- page = container_of((struct list_head *)h, struct page, lru);
+ if (need_reserve_slab_rcu)
+ page = virt_to_head_page(h);
+ else
+ page = container_of((struct list_head *)h, struct page, lru);
+
__free_slab(page->slab, page);
}
static void free_slab(struct kmem_cache *s, struct page *page)
{
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
- /*
- * RCU free overloads the RCU head over the LRU
- */
- struct rcu_head *head = (void *)&page->lru;
+ struct rcu_head *head;
+
+ if (need_reserve_slab_rcu) {
+ int order = compound_order(page);
+ int offset = (PAGE_SIZE << order) - s->reserved;
+
+ VM_BUG_ON(s->reserved != sizeof(*head));
+ head = page_address(page) + offset;
+ } else {
+ /*
+ * RCU free overloads the RCU head over the LRU
+ */
+ head = (void *)&page->lru;
+ }
call_rcu(head, rcu_free_slab);
} else
@@ -1277,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
}
/*
- * Per slab locking using the pagelock
- */
-static __always_inline void slab_lock(struct page *page)
-{
- bit_spin_lock(PG_locked, &page->flags);
-}
-
-static __always_inline void slab_unlock(struct page *page)
-{
- __bit_spin_unlock(PG_locked, &page->flags);
-}
-
-static __always_inline int slab_trylock(struct page *page)
-{
- int rc = 1;
-
- rc = bit_spin_trylock(PG_locked, &page->flags);
- return rc;
-}
-
-/*
- * Management of partially allocated slabs
+ * Management of partially allocated slabs.
+ *
+ * list_lock must be held.
*/
-static void add_partial(struct kmem_cache_node *n,
+static inline void add_partial(struct kmem_cache_node *n,
struct page *page, int tail)
{
- spin_lock(&n->list_lock);
n->nr_partial++;
if (tail)
list_add_tail(&page->lru, &n->partial);
else
list_add(&page->lru, &n->partial);
- spin_unlock(&n->list_lock);
}
-static inline void __remove_partial(struct kmem_cache_node *n,
+/*
+ * list_lock must be held.
+ */
+static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
list_del(&page->lru);
n->nr_partial--;
}
-static void remove_partial(struct kmem_cache *s, struct page *page)
-{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
-
- spin_lock(&n->list_lock);
- __remove_partial(n, page);
- spin_unlock(&n->list_lock);
-}
-
/*
- * Lock slab and remove from the partial list.
+ * Lock slab, remove from the partial list and put the object into the
+ * per cpu freelist.
*
* Must hold list_lock.
*/
-static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
- struct page *page)
+static inline int acquire_slab(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page)
{
- if (slab_trylock(page)) {
- __remove_partial(n, page);
- __SetPageSlubFrozen(page);
+ void *freelist;
+ unsigned long counters;
+ struct page new;
+
+ /*
+ * Zap the freelist and set the frozen bit.
+ * The old freelist is the list of objects for the
+ * per cpu allocation list.
+ */
+ do {
+ freelist = page->freelist;
+ counters = page->counters;
+ new.counters = counters;
+ new.inuse = page->objects;
+
+ VM_BUG_ON(new.frozen);
+ new.frozen = 1;
+
+ } while (!__cmpxchg_double_slab(s, page,
+ freelist, counters,
+ NULL, new.counters,
+ "lock and freeze"));
+
+ remove_partial(n, page);
+
+ if (freelist) {
+ /* Populate the per cpu freelist */
+ this_cpu_write(s->cpu_slab->freelist, freelist);
+ this_cpu_write(s->cpu_slab->page, page);
+ this_cpu_write(s->cpu_slab->node, page_to_nid(page));
return 1;
+ } else {
+ /*
+ * Slab page came from the wrong list. No object to allocate
+ * from. Put it onto the correct list and continue partial
+ * scan.
+ */
+ printk(KERN_ERR "SLUB: %s : Page without available objects on"
+ " partial list\n", s->name);
+ return 0;
}
- return 0;
}
/*
* Try to allocate a partial slab from a specific node.
*/
-static struct page *get_partial_node(struct kmem_cache_node *n)
+static struct page *get_partial_node(struct kmem_cache *s,
+ struct kmem_cache_node *n)
{
struct page *page;
@@ -1362,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
spin_lock(&n->list_lock);
list_for_each_entry(page, &n->partial, lru)
- if (lock_and_freeze_slab(n, page))
+ if (acquire_slab(s, n, page))
goto out;
page = NULL;
out:
@@ -1413,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) {
- page = get_partial_node(n);
+ page = get_partial_node(s, n);
if (page) {
put_mems_allowed();
return page;
@@ -1433,98 +1692,237 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
struct page *page;
int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
- page = get_partial_node(get_node(s, searchnode));
- if (page || node != -1)
+ page = get_partial_node(s, get_node(s, searchnode));
+ if (page || node != NUMA_NO_NODE)
return page;
return get_any_partial(s, flags);
}
+#ifdef CONFIG_PREEMPT
/*
- * Move a page back to the lists.
- *
- * Must be called with the slab lock held.
- *
- * On exit the slab lock will have been dropped.
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
*/
-static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
- __releases(bitlock)
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ return tid + TID_STEP;
+}
- __ClearPageSlubFrozen(page);
- if (page->inuse) {
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+ return tid % TID_STEP;
+}
- if (page->freelist) {
- add_partial(n, page, tail);
- stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
- } else {
- stat(s, DEACTIVATE_FULL);
- if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
- add_full(n, page);
- }
- slab_unlock(page);
- } else {
- stat(s, DEACTIVATE_EMPTY);
- if (n->nr_partial < s->min_partial) {
- /*
- * Adding an empty slab to the partial slabs in order
- * to avoid page allocator overhead. This slab needs
- * to come after the other slabs with objects in
- * so that the others get filled first. That way the
- * size of the partial list stays small.
- *
- * kmem_cache_shrink can reclaim any empty slabs from
- * the partial list.
- */
- add_partial(n, page, 1);
- slab_unlock(page);
- } else {
- slab_unlock(page);
- stat(s, FREE_SLAB);
- discard_slab(s, page);
- }
- }
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+ return tid / TID_STEP;
}
+static inline unsigned int init_tid(int cpu)
+{
+ return cpu;
+}
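With the helpers above, a tid encodes both the owning cpu (low bits) and an event counter (high bits). A hedged user-space walk-through, assuming a preemptible configuration where CONFIG_NR_CPUS rounds up to 4:

#include <stdio.h>

#define TID_STEP 4	/* stands in for roundup_pow_of_two(CONFIG_NR_CPUS) */

int main(void)
{
	unsigned long tid = 2;	/* init_tid(2): the sequence starts at the cpu number */
	int i;

	for (i = 0; i < 3; i++) {
		printf("tid=%lu cpu=%lu event=%lu\n",
		       tid, tid % TID_STEP, tid / TID_STEP);
		tid += TID_STEP;	/* next_tid(): event advances, cpu id is preserved */
	}
	return 0;	/* prints cpu 2 each time, events 0, 1, 2 */
}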
+
+static inline void note_cmpxchg_failure(const char *n,
+ const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+ unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+ printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+ if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+ printk("due to cpu change %d -> %d\n",
+ tid_to_cpu(tid), tid_to_cpu(actual_tid));
+ else
+#endif
+ if (tid_to_event(tid) != tid_to_event(actual_tid))
+ printk("due to cpu running other code. Event %ld->%ld\n",
+ tid_to_event(tid), tid_to_event(actual_tid));
+ else
+ printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+ actual_tid, tid, next_tid(tid));
+#endif
+ stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
+}
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+}
+
/*
* Remove the cpu slab
*/
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
- __releases(bitlock)
{
+ enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct page *page = c->page;
- int tail = 1;
-
- if (page->freelist)
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ int lock = 0;
+ enum slab_modes l = M_NONE, m = M_NONE;
+ void *freelist;
+ void *nextfree;
+ int tail = 0;
+ struct page new;
+ struct page old;
+
+ if (page->freelist) {
stat(s, DEACTIVATE_REMOTE_FREES);
+ tail = 1;
+ }
+
+ c->tid = next_tid(c->tid);
+ c->page = NULL;
+ freelist = c->freelist;
+ c->freelist = NULL;
+
/*
- * Merge cpu freelist into slab freelist. Typically we get here
- * because both freelists are empty. So this is unlikely
- * to occur.
+ * Stage one: Free all available per cpu objects back
+ * to the page freelist while it is still frozen. Leave the
+ * last one.
+ *
+ * There is no need to take the list->lock because the page
+ * is still frozen.
*/
- while (unlikely(c->freelist)) {
- void **object;
+ while (freelist && (nextfree = get_freepointer(s, freelist))) {
+ void *prior;
+ unsigned long counters;
+
+ do {
+ prior = page->freelist;
+ counters = page->counters;
+ set_freepointer(s, freelist, prior);
+ new.counters = counters;
+ new.inuse--;
+ VM_BUG_ON(!new.frozen);
+
+ } while (!__cmpxchg_double_slab(s, page,
+ prior, counters,
+ freelist, new.counters,
+ "drain percpu freelist"));
+
+ freelist = nextfree;
+ }
- tail = 0; /* Hot objects. Put the slab first */
+ /*
+ * Stage two: Ensure that the page is unfrozen while the
+ * list presence reflects the actual number of objects
+ * during unfreeze.
+ *
+ * We setup the list membership and then perform a cmpxchg
+ * with the count. If there is a mismatch then the page
+ * is not unfrozen but the page is on the wrong list.
+ *
+ * Then we restart the process which may have to remove
+ * the page from the list that we just put it on again
+ * because the number of objects in the slab may have
+ * changed.
+ */
+redo:
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ /* Determine target state of the slab */
+ new.counters = old.counters;
+ if (freelist) {
+ new.inuse--;
+ set_freepointer(s, freelist, old.freelist);
+ new.freelist = freelist;
+ } else
+ new.freelist = old.freelist;
- /* Retrieve object from cpu_freelist */
- object = c->freelist;
- c->freelist = get_freepointer(s, c->freelist);
+ new.frozen = 0;
- /* And put onto the regular freelist */
- set_freepointer(s, object, page->freelist);
- page->freelist = object;
- page->inuse--;
+ if (!new.inuse && n->nr_partial > s->min_partial)
+ m = M_FREE;
+ else if (new.freelist) {
+ m = M_PARTIAL;
+ if (!lock) {
+ lock = 1;
+ /*
+ * Taking the spinlock removes the possibility
+ * that acquire_slab() will see a slab page that
+ * is frozen
+ */
+ spin_lock(&n->list_lock);
+ }
+ } else {
+ m = M_FULL;
+ if (kmem_cache_debug(s) && !lock) {
+ lock = 1;
+ /*
+ * This also ensures that the scanning of full
+ * slabs from diagnostic functions will not see
+ * any frozen slabs.
+ */
+ spin_lock(&n->list_lock);
+ }
+ }
+
+ if (l != m) {
+ if (l == M_PARTIAL)
+ remove_partial(n, page);
+ else if (l == M_FULL)
+ remove_full(s, page);
+
+ if (m == M_PARTIAL) {
+ add_partial(n, page, tail);
+ stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+ } else if (m == M_FULL) {
+ stat(s, DEACTIVATE_FULL);
+ add_full(s, n, page);
+ }
+ }
+
+ l = m;
+ if (!__cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"))
+ goto redo;
+
+ if (lock)
+ spin_unlock(&n->list_lock);
+
+ if (m == M_FREE) {
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
}
- c->page = NULL;
- unfreeze_slab(s, page, tail);
}
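
The unfreeze logic above picks one of three destinations for the slab (discard it, put it on the node's partial list, or put it on the full list) from the remaining object count and the length of the node's partial list. A minimal user-space sketch of just that decision, with the kernel structures reduced to plain integers:

#include <stdio.h>

enum slab_mode { M_FREE, M_PARTIAL, M_FULL };

/*
 * Sketch of the target-state choice in deactivate_slab(): an empty slab
 * is discarded once the node already holds more than min_partial slabs,
 * a slab that still has free objects goes to the partial list, and a
 * fully allocated slab goes to the full list.
 */
static enum slab_mode target_state(unsigned int inuse, int has_free_objects,
				   unsigned long nr_partial,
				   unsigned long min_partial)
{
	if (!inuse && nr_partial > min_partial)
		return M_FREE;
	if (has_free_objects)
		return M_PARTIAL;
	return M_FULL;
}

int main(void)
{
	printf("%d\n", target_state(0, 1, 10, 5));	/* M_FREE */
	printf("%d\n", target_state(3, 1, 10, 5));	/* M_PARTIAL */
	printf("%d\n", target_state(8, 0, 10, 5));	/* M_FULL */
	return 0;
}
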
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
- slab_lock(c->page);
deactivate_slab(s, c);
}
@@ -1651,77 +2049,124 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void **object;
- struct page *new;
+ struct page *page;
+ unsigned long flags;
+ struct page new;
+ unsigned long counters;
+
+ local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+ /*
+ * We may have been preempted and rescheduled on a different
+ * cpu before disabling interrupts. Need to reload cpu area
+ * pointer.
+ */
+ c = this_cpu_ptr(s->cpu_slab);
+#endif
/* We handle __GFP_ZERO in the caller */
gfpflags &= ~__GFP_ZERO;
- if (!c->page)
+ page = c->page;
+ if (!page)
goto new_slab;
- slab_lock(c->page);
- if (unlikely(!node_match(c, node)))
- goto another_slab;
+ if (unlikely(!node_match(c, node))) {
+ stat(s, ALLOC_NODE_MISMATCH);
+ deactivate_slab(s, c);
+ goto new_slab;
+ }
+
+ stat(s, ALLOC_SLOWPATH);
+
+ do {
+ object = page->freelist;
+ counters = page->counters;
+ new.counters = counters;
+ VM_BUG_ON(!new.frozen);
+
+ /*
+ * If there is no object left then we use this loop to
+ * deactivate the slab which is simple since no objects
+ * are left in the slab and therefore we do not need to
+ * put the page back onto the partial list.
+ *
+ * If there are objects left then we retrieve them
+ * and use them to refill the per cpu queue.
+ */
+
+ new.inuse = page->objects;
+ new.frozen = object != NULL;
+
+ } while (!__cmpxchg_double_slab(s, page,
+ object, counters,
+ NULL, new.counters,
+ "__slab_alloc"));
+
+ if (unlikely(!object)) {
+ c->page = NULL;
+ stat(s, DEACTIVATE_BYPASS);
+ goto new_slab;
+ }
stat(s, ALLOC_REFILL);
load_freelist:
- object = c->page->freelist;
- if (unlikely(!object))
- goto another_slab;
- if (kmem_cache_debug(s))
- goto debug;
-
+ VM_BUG_ON(!page->frozen);
c->freelist = get_freepointer(s, object);
- c->page->inuse = c->page->objects;
- c->page->freelist = NULL;
- c->node = page_to_nid(c->page);
-unlock_out:
- slab_unlock(c->page);
- stat(s, ALLOC_SLOWPATH);
+ c->tid = next_tid(c->tid);
+ local_irq_restore(flags);
return object;
-another_slab:
- deactivate_slab(s, c);
-
new_slab:
- new = get_partial(s, gfpflags, node);
- if (new) {
- c->page = new;
+ page = get_partial(s, gfpflags, node);
+ if (page) {
stat(s, ALLOC_FROM_PARTIAL);
+ object = c->freelist;
+
+ if (kmem_cache_debug(s))
+ goto debug;
goto load_freelist;
}
- gfpflags &= gfp_allowed_mask;
- if (gfpflags & __GFP_WAIT)
- local_irq_enable();
-
- new = new_slab(s, gfpflags, node);
-
- if (gfpflags & __GFP_WAIT)
- local_irq_disable();
+ page = new_slab(s, gfpflags, node);
- if (new) {
+ if (page) {
c = __this_cpu_ptr(s->cpu_slab);
- stat(s, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
- slab_lock(new);
- __SetPageSlubFrozen(new);
- c->page = new;
+
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ object = page->freelist;
+ page->freelist = NULL;
+ page->inuse = page->objects;
+
+ stat(s, ALLOC_SLAB);
+ c->node = page_to_nid(page);
+ c->page = page;
+
+ if (kmem_cache_debug(s))
+ goto debug;
goto load_freelist;
}
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
+ local_irq_restore(flags);
return NULL;
+
debug:
- if (!alloc_debug_processing(s, c->page, object, addr))
- goto another_slab;
+ if (!object || !alloc_debug_processing(s, page, object, addr))
+ goto new_slab;
- c->page->inuse++;
- c->page->freelist = get_freepointer(s, object);
+ c->freelist = get_freepointer(s, object);
+ deactivate_slab(s, c);
+ c->page = NULL;
c->node = NUMA_NO_NODE;
- goto unlock_out;
+ local_irq_restore(flags);
+ return object;
}
/*
@@ -1739,23 +2184,58 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
{
void **object;
struct kmem_cache_cpu *c;
- unsigned long flags;
+ unsigned long tid;
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
- local_irq_save(flags);
+redo:
+
+ /*
+ * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+ * enabled. We may switch back and forth between cpus while
+ * reading from one cpu area. That does not matter as long
+ * as we end up on the original cpu again when doing the cmpxchg.
+ */
c = __this_cpu_ptr(s->cpu_slab);
+
+ /*
+ * The transaction ids are globally unique per cpu and per operation on
+ * a per cpu queue. Thus they guarantee that the cmpxchg_double
+ * occurs on the right processor and that there was no operation on the
+ * linked list in between.
+ */
+ tid = c->tid;
+ barrier();
+
object = c->freelist;
if (unlikely(!object || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
- c->freelist = get_freepointer(s, object);
+ /*
+ * The cmpxchg will only match if there was no additional
+ * operation and if we are on the right processor.
+ *
+ * The cmpxchg does the following atomically (without lock semantics!)
+ * 1. Relocate first pointer to the current per cpu area.
+ * 2. Verify that tid and freelist have not been changed
+ * 3. If they were not changed replace tid and freelist
+ *
+ * Since this is without lock semantics the protection is only against
+ * code executing on this cpu *not* from access by other cpus.
+ */
+ if (unlikely(!irqsafe_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ object, tid,
+ get_freepointer_safe(s, object), next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_alloc", s, tid);
+ goto redo;
+ }
stat(s, ALLOC_FASTPATH);
}
- local_irq_restore(flags);
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->objsize);
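
The comments above describe per-cpu transaction ids that make the lockless fastpath safe against preemption and concurrent operations. The helpers tid_to_cpu(), tid_to_event(), next_tid() and init_tid() are not shown in this hunk; the encoding below is an assumption modelled on their names, with the cpu number in the low bits and an event counter in the high bits, and is only a user-space illustration:

#include <stdio.h>

#define TID_STEP 256UL	/* assumed: large enough to cover every possible cpu */

static unsigned long init_tid(int cpu)			{ return cpu; }
static unsigned long next_tid(unsigned long tid)	{ return tid + TID_STEP; }
static int tid_to_cpu(unsigned long tid)		{ return tid % TID_STEP; }
static unsigned long tid_to_event(unsigned long tid)	{ return tid / TID_STEP; }

int main(void)
{
	unsigned long tid = init_tid(3);	/* fastpath sampled on cpu 3 */

	tid = next_tid(tid);			/* one allocation retired */
	printf("cpu=%d event=%lu\n", tid_to_cpu(tid), tid_to_event(tid));

	/*
	 * A tid sampled before migrating to cpu 5 can never equal a tid
	 * generated on cpu 5, so the cmpxchg in slab_alloc() must fail
	 * and the operation is redone.
	 */
	printf("cpu match after migration: %d\n",
	       tid_to_cpu(tid) == tid_to_cpu(init_tid(5)));
	return 0;
}
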
@@ -1833,57 +2313,91 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
{
void *prior;
void **object = (void *)x;
+ int was_frozen;
+ int inuse;
+ struct page new;
+ unsigned long counters;
+ struct kmem_cache_node *n = NULL;
+ unsigned long uninitialized_var(flags);
stat(s, FREE_SLOWPATH);
- slab_lock(page);
- if (kmem_cache_debug(s))
- goto debug;
+ if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
+ return;
-checks_ok:
- prior = page->freelist;
- set_freepointer(s, object, prior);
- page->freelist = object;
- page->inuse--;
+ do {
+ prior = page->freelist;
+ counters = page->counters;
+ set_freepointer(s, object, prior);
+ new.counters = counters;
+ was_frozen = new.frozen;
+ new.inuse--;
+ if ((!new.inuse || !prior) && !was_frozen && !n) {
+ n = get_node(s, page_to_nid(page));
+ /*
+ * Speculatively acquire the list_lock.
+ * If the cmpxchg does not succeed then we may
+ * drop the list_lock without any processing.
+ *
+ * Otherwise the list_lock will synchronize with
+ * other processors updating the list of slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+ }
+ inuse = new.inuse;
- if (unlikely(PageSlubFrozen(page))) {
- stat(s, FREE_FROZEN);
- goto out_unlock;
- }
+ } while (!cmpxchg_double_slab(s, page,
+ prior, counters,
+ object, new.counters,
+ "__slab_free"));
- if (unlikely(!page->inuse))
- goto slab_empty;
+ if (likely(!n)) {
+ /*
+ * The list lock was not taken therefore no list
+ * activity can be necessary.
+ */
+ if (was_frozen)
+ stat(s, FREE_FROZEN);
+ return;
+ }
/*
- * Objects left in the slab. If it was not on the partial list before
- * then add it.
+ * was_frozen may have been set after we acquired the list_lock in
+ * an earlier loop. So we need to check it here again.
*/
- if (unlikely(!prior)) {
- add_partial(get_node(s, page_to_nid(page)), page, 1);
- stat(s, FREE_ADD_PARTIAL);
- }
+ if (was_frozen)
+ stat(s, FREE_FROZEN);
+ else {
+ if (unlikely(!inuse && n->nr_partial > s->min_partial))
+ goto slab_empty;
-out_unlock:
- slab_unlock(page);
+ /*
+ * Objects left in the slab. If it was not on the partial list before
+ * then add it.
+ */
+ if (unlikely(!prior)) {
+ remove_full(s, page);
+ add_partial(n, page, 0);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ }
+ spin_unlock_irqrestore(&n->list_lock, flags);
return;
slab_empty:
if (prior) {
/*
- * Slab still on the partial list.
+ * Slab on the partial list.
*/
- remove_partial(s, page);
+ remove_partial(n, page);
stat(s, FREE_REMOVE_PARTIAL);
- }
- slab_unlock(page);
+ } else
+ /* Slab must be on the full list */
+ remove_full(s, page);
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
- return;
-
-debug:
- if (!free_debug_processing(s, page, x, addr))
- goto out_unlock;
- goto checks_ok;
}
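
Both the free slowpath above and the allocation slowpath rely on cmpxchg_double_slab() to replace the page's freelist and counters as a single unit. A user-space sketch of that primitive using the GCC __atomic builtins (on x86-64 this may need -mcx16 or linking with -latomic); the structure below merely stands in for the (freelist, counters) pair and is not the kernel's struct page:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the (freelist, counters) pair updated as one unit. */
struct slab_words {
	void *freelist;
	unsigned long counters;
} __attribute__((aligned(2 * sizeof(void *))));	/* double word alignment */

/* Compare both words and, only if both still match, replace both. */
static bool cmpxchg_double(struct slab_words *w,
			   void *old_free, unsigned long old_cnt,
			   void *new_free, unsigned long new_cnt)
{
	struct slab_words expected = { old_free, old_cnt };
	struct slab_words desired  = { new_free, new_cnt };

	return __atomic_compare_exchange(w, &expected, &desired, false,
					 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	unsigned long a = 1, b = 2;
	struct slab_words w = { &a, 10 };

	/* Succeeds: both words still hold the values we observed. */
	printf("%d\n", cmpxchg_double(&w, &a, 10, &b, 9));
	/* Fails: the words changed since that stale observation. */
	printf("%d\n", cmpxchg_double(&w, &a, 10, &b, 9));
	return 0;
}
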
/*
@@ -1902,23 +2416,38 @@ static __always_inline void slab_free(struct kmem_cache *s,
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
- unsigned long flags;
+ unsigned long tid;
slab_free_hook(s, x);
- local_irq_save(flags);
+redo:
+
+ /*
+ * Determine the per cpu slab of the current cpu.
+ * The cpu may change afterward. However that does not matter since
+ * data is retrieved via this pointer. If we are on the same cpu
+ * during the cmpxchg then the free will succeed.
+ */
c = __this_cpu_ptr(s->cpu_slab);
- slab_free_hook_irq(s, x);
+ tid = c->tid;
+ barrier();
- if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
+ if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
- c->freelist = object;
+
+ if (unlikely(!irqsafe_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ c->freelist, tid,
+ object, next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_free", s, tid);
+ goto redo;
+ }
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, x, addr);
- local_irq_restore(flags);
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1988,13 +2517,13 @@ static int slub_nomerge;
* the smallest order which will fit the object.
*/
static inline int slab_order(int size, int min_objects,
- int max_order, int fract_leftover)
+ int max_order, int fract_leftover, int reserved)
{
int order;
int rem;
int min_order = slub_min_order;
- if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
+ if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
return get_order(size * MAX_OBJS_PER_PAGE) - 1;
for (order = max(min_order,
@@ -2003,10 +2532,10 @@ static inline int slab_order(int size, int min_objects,
unsigned long slab_size = PAGE_SIZE << order;
- if (slab_size < min_objects * size)
+ if (slab_size < min_objects * size + reserved)
continue;
- rem = slab_size % size;
+ rem = (slab_size - reserved) % size;
if (rem <= slab_size / fract_leftover)
break;
@@ -2016,7 +2545,7 @@ static inline int slab_order(int size, int min_objects,
return order;
}
-static inline int calculate_order(int size)
+static inline int calculate_order(int size, int reserved)
{
int order;
int min_objects;
@@ -2034,14 +2563,14 @@ static inline int calculate_order(int size)
min_objects = slub_min_objects;
if (!min_objects)
min_objects = 4 * (fls(nr_cpu_ids) + 1);
- max_objects = (PAGE_SIZE << slub_max_order)/size;
+ max_objects = order_objects(slub_max_order, size, reserved);
min_objects = min(min_objects, max_objects);
while (min_objects > 1) {
fraction = 16;
while (fraction >= 4) {
order = slab_order(size, min_objects,
- slub_max_order, fraction);
+ slub_max_order, fraction, reserved);
if (order <= slub_max_order)
return order;
fraction /= 2;
@@ -2053,14 +2582,14 @@ static inline int calculate_order(int size)
* We were unable to place multiple objects in a slab. Now
* lets see if we can place a single object there.
*/
- order = slab_order(size, 1, slub_max_order, 1);
+ order = slab_order(size, 1, slub_max_order, 1, reserved);
if (order <= slub_max_order)
return order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
- order = slab_order(size, 1, MAX_ORDER, 1);
+ order = slab_order(size, 1, MAX_ORDER, 1, reserved);
if (order < MAX_ORDER)
return order;
return -ENOSYS;
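
The new 'reserved' argument keeps a fixed number of bytes (a struct rcu_head for SLAB_DESTROY_BY_RCU caches, per kmem_cache_open() below) out of the object area, so both the fit check and the leftover test work on slab_size - reserved. A simplified user-space version of the loop makes the effect visible; the real slab_order() also honours slub_min_order and a computed starting order:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/*
 * Simplified slab_order(): find the smallest order whose leftover space,
 * after subtracting the reserved bytes, is at most 1/fract_leftover of
 * the slab.
 */
static int slab_order(unsigned long size, unsigned long min_objects,
		      int max_order, int fract_leftover, unsigned long reserved)
{
	int order;

	for (order = 0; order <= max_order; order++) {
		unsigned long slab_size = PAGE_SIZE << order;
		unsigned long rem;

		if (slab_size < min_objects * size + reserved)
			continue;
		rem = (slab_size - reserved) % size;
		if (rem <= slab_size / fract_leftover)
			return order;
	}
	return max_order;
}

int main(void)
{
	/* 512-byte objects, at least 8 per slab, waste at most 1/16. */
	printf("reserved=0:  order %d\n", slab_order(512, 8, 3, 16, 0));
	printf("reserved=16: order %d\n", slab_order(512, 8, 3, 16, 16));
	return 0;
}
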
@@ -2110,9 +2639,19 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
- s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+ /*
+ * Must align to double word boundary for the double cmpxchg
+ * instructions to work; see __pcpu_double_call_return_bool().
+ */
+ s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+ 2 * sizeof(void *));
+
+ if (!s->cpu_slab)
+ return 0;
+
+ init_kmem_cache_cpus(s);
- return s->cpu_slab != NULL;
+ return 1;
}
static struct kmem_cache *kmem_cache_node;
@@ -2130,7 +2669,6 @@ static void early_kmem_cache_node_alloc(int node)
{
struct page *page;
struct kmem_cache_node *n;
- unsigned long flags;
BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
@@ -2148,6 +2686,7 @@ static void early_kmem_cache_node_alloc(int node)
BUG_ON(!n);
page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse++;
+ page->frozen = 0;
kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2156,14 +2695,7 @@ static void early_kmem_cache_node_alloc(int node)
init_kmem_cache_node(n, kmem_cache_node);
inc_slabs_node(kmem_cache_node, node, page->objects);
- /*
- * lockdep requires consistent irq usage for each lock
- * so even though there cannot be a race this early in
- * the boot sequence, we still disable irqs.
- */
- local_irq_save(flags);
add_partial(n, page, 0);
- local_irq_restore(flags);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2311,7 +2843,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
if (forced_order >= 0)
order = forced_order;
else
- order = calculate_order(size);
+ order = calculate_order(size, s->reserved);
if (order < 0)
return 0;
@@ -2329,8 +2861,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* Determine the number of objects per slab
*/
- s->oo = oo_make(order, size);
- s->min = oo_make(get_order(size), size);
+ s->oo = oo_make(order, size, s->reserved);
+ s->min = oo_make(get_order(size), size, s->reserved);
if (oo_objects(s->oo) > oo_objects(s->max))
s->max = s->oo;
@@ -2349,6 +2881,10 @@ static int kmem_cache_open(struct kmem_cache *s,
s->objsize = size;
s->align = align;
s->flags = kmem_cache_flags(size, flags, name, ctor);
+ s->reserved = 0;
+
+ if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
+ s->reserved = sizeof(struct rcu_head);
if (!calculate_sizes(s, -1))
goto error;
@@ -2365,6 +2901,12 @@ static int kmem_cache_open(struct kmem_cache *s,
}
}
+#ifdef CONFIG_CMPXCHG_DOUBLE
+ if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
+ /* Enable fast mode */
+ s->flags |= __CMPXCHG_DOUBLE;
+#endif
+
/*
* The larger the object size is, the more pages we want on the partial
* list to avoid pounding the page allocator excessively.
@@ -2399,12 +2941,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *s)
-{
- return s->name;
-}
-EXPORT_SYMBOL(kmem_cache_name);
-
static void list_slab_objects(struct kmem_cache *s, struct page *page,
const char *text)
{
@@ -2417,9 +2953,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
return;
slab_err(s, page, "%s", text);
slab_lock(page);
- for_each_free_object(p, s, page->freelist)
- set_bit(slab_index(p, s, addr), map);
+ get_map(s, page, map);
for_each_object(p, s, addr, page->objects) {
if (!test_bit(slab_index(p, s, addr), map)) {
@@ -2444,7 +2979,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- __remove_partial(n, page);
+ remove_partial(n, page);
discard_slab(s, page);
} else {
list_slab_objects(s, page,
@@ -2696,7 +3231,6 @@ EXPORT_SYMBOL(__kmalloc_node);
size_t ksize(const void *object)
{
struct page *page;
- struct kmem_cache *s;
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
@@ -2707,30 +3241,46 @@ size_t ksize(const void *object)
WARN_ON(!PageCompound(page));
return PAGE_SIZE << compound_order(page);
}
- s = page->slab;
+
+ return slab_ksize(page->slab);
+}
+EXPORT_SYMBOL(ksize);
#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->objsize;
+bool verify_mem_not_deleted(const void *x)
+{
+ struct page *page;
+ void *object = (void *)x;
+ unsigned long flags;
+ bool rv;
-#endif
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
+ if (unlikely(ZERO_OR_NULL_PTR(x)))
+ return false;
+
+ local_irq_save(flags);
+
+ page = virt_to_head_page(x);
+ if (unlikely(!PageSlab(page))) {
+ /* maybe it was from stack? */
+ rv = true;
+ goto out_unlock;
+ }
+
+ slab_lock(page);
+ if (on_freelist(page->slab, page, object)) {
+ object_err(page->slab, page, object, "Object is on free-list");
+ rv = false;
+ } else {
+ rv = true;
+ }
+ slab_unlock(page);
+
+out_unlock:
+ local_irq_restore(flags);
+ return rv;
}
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(verify_mem_not_deleted);
+#endif
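
verify_mem_not_deleted() returns false only when the object is found on its slab's freelist, so a debugging caller can check that a pointer has not already been freed. A minimal, hypothetical use in kernel context (the wrapper and message are invented, and the helper only exists under CONFIG_SLUB_DEBUG):

/* Hypothetical debugging aid built on verify_mem_not_deleted(). */
static void check_still_allocated(const void *obj)
{
	if (!verify_mem_not_deleted(obj))
		pr_err("object %p was already freed\n", obj);
}
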
void kfree(const void *x)
{
@@ -2797,14 +3347,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- if (!page->inuse && slab_trylock(page)) {
- /*
- * Must hold slab lock here because slab_free
- * may have freed the last object and be
- * waiting to release the slab.
- */
- __remove_partial(n, page);
- slab_unlock(page);
+ if (!page->inuse) {
+ remove_partial(n, page);
discard_slab(s, page);
} else {
list_move(&page->lru,
@@ -2968,7 +3512,7 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
list_for_each_entry(p, &n->partial, lru)
p->slab = s;
-#ifdef CONFIG_SLAB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG
list_for_each_entry(p, &n->full, lru)
p->slab = s;
#endif
@@ -3312,7 +3856,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
- /* Honor the call site pointer we recieved. */
+ /* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
return ret;
@@ -3342,7 +3886,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
ret = slab_alloc(s, gfpflags, node, caller);
- /* Honor the call site pointer we recieved. */
+ /* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
return ret;
@@ -3375,10 +3919,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
/* Now we know that a valid freelist exists */
bitmap_zero(map, page->objects);
- for_each_free_object(p, s, page->freelist) {
- set_bit(slab_index(p, s, addr), map);
- if (!check_object(s, page, p, SLUB_RED_INACTIVE))
- return 0;
+ get_map(s, page, map);
+ for_each_object(p, s, addr, page->objects) {
+ if (test_bit(slab_index(p, s, addr), map))
+ if (!check_object(s, page, p, SLUB_RED_INACTIVE))
+ return 0;
}
for_each_object(p, s, addr, page->objects)
@@ -3391,12 +3936,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
static void validate_slab_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
- if (slab_trylock(page)) {
- validate_slab(s, page, map);
- slab_unlock(page);
- } else
- printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
- s->name, page);
+ slab_lock(page);
+ validate_slab(s, page, map);
+ slab_unlock(page);
}
static int validate_slab_node(struct kmem_cache *s,
@@ -3586,8 +4128,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
void *p;
bitmap_zero(map, page->objects);
- for_each_free_object(p, s, page->freelist)
- set_bit(slab_index(p, s, addr), map);
+ get_map(s, page, map);
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
@@ -3862,7 +4403,7 @@ static int any_slab_objects(struct kmem_cache *s)
#endif
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
-#define to_slab(n) container_of(n, struct kmem_cache, kobj);
+#define to_slab(n) container_of(n, struct kmem_cache, kobj)
struct slab_attribute {
struct attribute attr;
@@ -4017,6 +4558,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(destroy_by_rcu);
+static ssize_t reserved_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->reserved);
+}
+SLAB_ATTR_RO(reserved);
+
#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
@@ -4039,8 +4586,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
const char *buf, size_t length)
{
s->flags &= ~SLAB_DEBUG_FREE;
- if (buf[0] == '1')
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_DEBUG_FREE;
+ }
return length;
}
SLAB_ATTR(sanity_checks);
@@ -4054,8 +4603,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
size_t length)
{
s->flags &= ~SLAB_TRACE;
- if (buf[0] == '1')
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_TRACE;
+ }
return length;
}
SLAB_ATTR(trace);
@@ -4072,8 +4623,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
return -EBUSY;
s->flags &= ~SLAB_RED_ZONE;
- if (buf[0] == '1')
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_RED_ZONE;
+ }
calculate_sizes(s, -1);
return length;
}
@@ -4091,8 +4644,10 @@ static ssize_t poison_store(struct kmem_cache *s,
return -EBUSY;
s->flags &= ~SLAB_POISON;
- if (buf[0] == '1')
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_POISON;
+ }
calculate_sizes(s, -1);
return length;
}
@@ -4110,8 +4665,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
return -EBUSY;
s->flags &= ~SLAB_STORE_USER;
- if (buf[0] == '1')
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_STORE_USER;
+ }
calculate_sizes(s, -1);
return length;
}
@@ -4276,6 +4833,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
+STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4283,7 +4841,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
+STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
+STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
+STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
#endif
static struct attribute *slab_attrs[] = {
@@ -4303,6 +4864,7 @@ static struct attribute *slab_attrs[] = {
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
+ &reserved_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
&slabs_attr.attr,
@@ -4332,6 +4894,7 @@ static struct attribute *slab_attrs[] = {
&alloc_from_partial_attr.attr,
&alloc_slab_attr.attr,
&alloc_refill_attr.attr,
+ &alloc_node_mismatch_attr.attr,
&free_slab_attr.attr,
&cpuslab_flush_attr.attr,
&deactivate_full_attr.attr,
@@ -4339,7 +4902,10 @@ static struct attribute *slab_attrs[] = {
&deactivate_to_head_attr.attr,
&deactivate_to_tail_attr.attr,
&deactivate_remote_frees_attr.attr,
+ &deactivate_bypass_attr.attr,
&order_fallback_attr.attr,
+ &cmpxchg_double_fail_attr.attr,
+ &cmpxchg_double_cpu_fail_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index 93250207c5c..858e1dff9b2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif
-int page_to_nid(struct page *page)
+int page_to_nid(const struct page *page)
{
return section_to_node_table[page_to_section(page)];
}
@@ -500,7 +500,7 @@ void __init sparse_init(void)
* so alloc 2M (with 2M align) and 24 bytes in turn will
* make next 2M slip to one more 2M later.
* then in big system, the memory will have a lot of holes...
- * here try to allocate 2M pages continously.
+ * here try to allocate 2M pages continuously.
*
* powerpc need to call sparse_init_one_section right after each
* sparse_early_mem_map_alloc, so allocate usemap_map at first.
diff --git a/mm/swap.c b/mm/swap.c
index c02f93611a8..3a442f18b0b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,6 +39,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -178,15 +179,13 @@ void put_pages_list(struct list_head *pages)
}
EXPORT_SYMBOL(put_pages_list);
-/*
- * pagevec_move_tail() must be called with IRQ disabled.
- * Otherwise this may cause nasty races.
- */
-static void pagevec_move_tail(struct pagevec *pvec)
+static void pagevec_lru_move_fn(struct pagevec *pvec,
+ void (*move_fn)(struct page *page, void *arg),
+ void *arg)
{
int i;
- int pgmoved = 0;
struct zone *zone = NULL;
+ unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -194,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec)
if (pagezone != zone) {
if (zone)
- spin_unlock(&zone->lru_lock);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
zone = pagezone;
- spin_lock(&zone->lru_lock);
- }
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
- list_move_tail(&page->lru, &zone->lru[lru].list);
- pgmoved++;
+ spin_lock_irqsave(&zone->lru_lock, flags);
}
+
+ (*move_fn)(page, arg);
}
if (zone)
- spin_unlock(&zone->lru_lock);
- __count_vm_events(PGROTATED, pgmoved);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
+static void pagevec_move_tail_fn(struct page *page, void *arg)
+{
+ int *pgmoved = arg;
+ struct zone *zone = page_zone(page);
+
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ enum lru_list lru = page_lru_base_type(page);
+ list_move_tail(&page->lru, &zone->lru[lru].list);
+ mem_cgroup_rotate_reclaimable_page(page);
+ (*pgmoved)++;
+ }
+}
+
+/*
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
+ */
+static void pagevec_move_tail(struct pagevec *pvec)
+{
+ int pgmoved = 0;
+
+ pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
+ __count_vm_events(PGROTATED, pgmoved);
+}
+
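
pagevec_lru_move_fn() factors out the batching pattern that the old pagevec_move_tail() used: walk the batch, re-take the per-zone lock only when the zone changes, and apply a caller-supplied move function to each page. A user-space sketch of the same pattern, with a mutex standing in for the irq-safe zone lock (build with -pthread); the types here are invented for illustration:

#include <pthread.h>
#include <stdio.h>

struct zone { pthread_mutex_t lock; int moved; };
struct item { struct zone *zone; int id; };

/* Apply move_fn to each item, relocking only when the zone changes. */
static void batch_move(struct item *items, int n,
		       void (*move_fn)(struct item *, void *), void *arg)
{
	struct zone *locked = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (items[i].zone != locked) {
			if (locked)
				pthread_mutex_unlock(&locked->lock);
			locked = items[i].zone;
			pthread_mutex_lock(&locked->lock);
		}
		move_fn(&items[i], arg);
	}
	if (locked)
		pthread_mutex_unlock(&locked->lock);
}

static void count_move(struct item *it, void *arg)
{
	it->zone->moved++;
	(*(int *)arg)++;
}

int main(void)
{
	struct zone z0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct zone z1 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct item batch[] = { {&z0, 1}, {&z0, 2}, {&z1, 3} };
	int total = 0;

	batch_move(batch, 3, count_move, &total);
	printf("moved %d (zone0=%d zone1=%d)\n", total, z0.moved, z1.moved);
	return 0;
}
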
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
*/
-void rotate_reclaimable_page(struct page *page)
+void rotate_reclaimable_page(struct page *page)
{
if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
!PageUnevictable(page) && PageLRU(page)) {
@@ -252,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
memcg_reclaim_stat->recent_rotated[file]++;
}
-/*
- * FIXME: speed this up?
- */
-void activate_page(struct page *page)
+static void __activate_page(struct page *page, void *arg)
{
struct zone *zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
int file = page_is_file_cache(page);
int lru = page_lru_base_type(page);
@@ -272,8 +288,45 @@ void activate_page(struct page *page)
update_page_reclaim_stat(zone, page, file, 1);
}
+}
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+
+static void activate_page_drain(int cpu)
+{
+ struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
+
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+}
+
+void activate_page(struct page *page)
+{
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ put_cpu_var(activate_page_pvecs);
+ }
+}
+
+#else
+static inline void activate_page_drain(int cpu)
+{
+}
+
+void activate_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ __activate_page(page, NULL);
spin_unlock_irq(&zone->lru_lock);
}
+#endif
/*
* Mark a page as having seen activity.
@@ -347,6 +400,74 @@ void add_page_to_unevictable_list(struct page *page)
}
/*
+ * If the page cannot be invalidated, it is moved to the
+ * inactive list to speed up its reclaim. It is moved to the
+ * head of the list, rather than the tail, to give the flusher
+ * threads some time to write it out, as this is much more
+ * effective than the single-page writeout from reclaim.
+ *
+ * If the page isn't page_mapped and is dirty or under writeback,
+ * it can be reclaimed ASAP by using PG_reclaim.
+ *
+ * 1. active, mapped page -> none
+ * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
+ * 3. inactive, mapped page -> none
+ * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
+ * 5. inactive, clean -> inactive, tail
+ * 6. Others -> none
+ *
+ * In case 4 the page is moved to the head of the inactive list because
+ * the VM expects the flusher threads to write it out, which is much
+ * more effective than the single-page writeout from reclaim.
+ */
+static void lru_deactivate_fn(struct page *page, void *arg)
+{
+ int lru, file;
+ bool active;
+ struct zone *zone = page_zone(page);
+
+ if (!PageLRU(page))
+ return;
+
+ if (PageUnevictable(page))
+ return;
+
+ /* Some processes are using the page */
+ if (page_mapped(page))
+ return;
+
+ active = PageActive(page);
+
+ file = page_is_file_cache(page);
+ lru = page_lru_base_type(page);
+ del_page_from_lru_list(zone, page, lru + active);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(zone, page, lru);
+
+ if (PageWriteback(page) || PageDirty(page)) {
+ /*
+ * PG_reclaim can race with end_page_writeback(), which
+ * may confuse readahead. But the race window is really
+ * small and this is a non-critical problem.
+ */
+ SetPageReclaim(page);
+ } else {
+ /*
+ * The page's writeback ended while it was in the pagevec,
+ * so move the page to the tail of the inactive list.
+ */
+ list_move_tail(&page->lru, &zone->lru[lru].list);
+ mem_cgroup_rotate_reclaimable_page(page);
+ __count_vm_event(PGROTATED);
+ }
+
+ if (active)
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(zone, page, file, 0);
+}
+
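
The case list above reduces to a small decision: pages that are off the LRU, unevictable or still mapped are left alone; dirty or writeback pages go to the head of the inactive list with PG_reclaim set; clean pages go to the tail. A pure-logic sketch of that table, with the page state modelled as plain booleans for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the decisions taken by lru_deactivate_fn() above. */
static const char *deactivate_action(bool on_lru, bool unevictable,
				     bool mapped, bool dirty_or_writeback)
{
	if (!on_lru || unevictable || mapped)
		return "none";				/* cases 1, 3 and 6 */
	if (dirty_or_writeback)
		return "inactive head, PG_reclaim";	/* cases 2 and 4 */
	return "inactive tail";				/* case 5 */
}

int main(void)
{
	printf("%s\n", deactivate_action(true, false, false, true));
	printf("%s\n", deactivate_action(true, false, false, false));
	printf("%s\n", deactivate_action(true, false, true, true));
	return 0;
}
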
+/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
@@ -372,6 +493,38 @@ static void drain_cpu_pagevecs(int cpu)
pagevec_move_tail(pvec);
local_irq_restore(flags);
}
+
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
+ activate_page_drain(cpu);
+}
+
+/**
+ * deactivate_page - forcefully deactivate a page
+ * @page: page to deactivate
+ *
+ * This function hints the VM that @page is a good reclaim candidate,
+ * for example if its invalidation fails due to the page being dirty
+ * or under writeback.
+ */
+void deactivate_page(struct page *page)
+{
+ /*
+ * In a workload with many unevictable pages (such as mprotect),
+ * deactivating unevictable pages to accelerate reclaim is pointless.
+ */
+ if (PageUnevictable(page))
+ return;
+
+ if (likely(get_page_unless_zero(page))) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
}
void lru_add_drain(void)
@@ -516,44 +669,33 @@ void lru_add_page_tail(struct zone* zone,
}
}
+static void ____pagevec_lru_add_fn(struct page *page, void *arg)
+{
+ enum lru_list lru = (enum lru_list)arg;
+ struct zone *zone = page_zone(page);
+ int file = is_file_lru(lru);
+ int active = is_active_lru(lru);
+
+ VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON(PageLRU(page));
+
+ SetPageLRU(page);
+ if (active)
+ SetPageActive(page);
+ update_page_reclaim_stat(zone, page, file, active);
+ add_page_to_lru_list(zone, page, lru);
+}
+
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
- int i;
- struct zone *zone = NULL;
-
VM_BUG_ON(is_unevictable_lru(lru));
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
- int file;
- int active;
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- VM_BUG_ON(PageActive(page));
- VM_BUG_ON(PageUnevictable(page));
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- active = is_active_lru(lru);
- file = is_file_lru(lru);
- if (active)
- SetPageActive(page);
- update_page_reclaim_stat(zone, page, file, active);
- add_page_to_lru_list(zone, page, lru);
- }
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
- pagevec_reinit(pvec);
+ pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
}
EXPORT_SYMBOL(____pagevec_lru_add);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5c8cfabbc9b..46680461785 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
/*
* swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
- * future use of radix_tree tags in the swap cache.
+ * vmscan's shrink_page_list.
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .sync_page = block_sync_page,
.set_page_dirty = __set_page_dirty_nobuffers,
.migratepage = migrate_page,
};
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
static struct backing_dev_info swap_backing_dev_info = {
.name = "swap",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
- .unplug_io_fn = swap_unplug_io_fn,
};
struct address_space swapper_space = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0341c5700e3..17bc224bce6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
-#include <linux/shm.h>
+#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
@@ -31,6 +31,7 @@
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
+#include <linux/oom.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -95,39 +96,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
}
/*
- * We need this because the bdev->unplug_fn can sleep and we cannot
- * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a mutex.
- */
-static DECLARE_RWSEM(swap_unplug_sem);
-
-void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
-{
- swp_entry_t entry;
-
- down_read(&swap_unplug_sem);
- entry.val = page_private(page);
- if (PageSwapCache(page)) {
- struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
- struct backing_dev_info *bdi;
-
- /*
- * If the page is removed from swapcache from under us (with a
- * racy try_to_unuse/swapoff) we need an additional reference
- * count to avoid reading garbage from page_private(page) above.
- * If the WARN_ON triggers during a swapoff it maybe the race
- * condition and it's harmless. However if it triggers without
- * swapoff it signals a problem.
- */
- WARN_ON(page_count(page) <= 1);
-
- bdi = bdev->bd_inode->i_mapping->backing_dev_info;
- blk_run_backing_dev(bdi, page);
- }
- up_read(&swap_unplug_sem);
-}
-
-/*
* swapon tell device that all the old swap contents can be discarded,
* to allow the swap device to optimize its wear-levelling.
*/
@@ -212,8 +180,8 @@ static int wait_for_discard(void *word)
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
-static inline unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned char usage)
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+ unsigned char usage)
{
unsigned long offset;
unsigned long scan_base;
@@ -880,7 +848,7 @@ unsigned int count_swap_pages(int type, int free)
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- struct mem_cgroup *ptr = NULL;
+ struct mem_cgroup *ptr;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
@@ -1550,6 +1518,36 @@ bad_bmap:
goto out;
}
+static void enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map)
+{
+ int i, prev;
+
+ spin_lock(&swap_lock);
+ if (prio >= 0)
+ p->prio = prio;
+ else
+ p->prio = --least_priority;
+ p->swap_map = swap_map;
+ p->flags |= SWP_WRITEOK;
+ nr_swap_pages += p->pages;
+ total_swap_pages += p->pages;
+
+ /* insert swap space into swap_list: */
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+ if (p->prio >= swap_info[i]->prio)
+ break;
+ prev = i;
+ }
+ p->next = i;
+ if (prev < 0)
+ swap_list.head = swap_list.next = p->type;
+ else
+ swap_info[prev]->next = p->type;
+ spin_unlock(&swap_lock);
+}
+
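
enable_swap_info() re-inserts the swap area into swap_list, which is threaded through integer 'next' indices and kept sorted by descending priority. A user-space model of just that insertion (the array size and priorities are invented):

#include <stdio.h>

struct swap_entry { int prio; int next; };

static struct swap_entry swap_info[4];
static int swap_list_head = -1;

/* Insert 'type' into the list so priorities stay in descending order. */
static void insert_by_prio(int type)
{
	int i, prev = -1;

	for (i = swap_list_head; i >= 0; i = swap_info[i].next) {
		if (swap_info[type].prio >= swap_info[i].prio)
			break;
		prev = i;
	}
	swap_info[type].next = i;
	if (prev < 0)
		swap_list_head = type;
	else
		swap_info[prev].next = type;
}

int main(void)
{
	int i;

	swap_info[0].prio = -1; insert_by_prio(0);
	swap_info[1].prio =  5; insert_by_prio(1);
	swap_info[2].prio =  2; insert_by_prio(2);

	for (i = swap_list_head; i >= 0; i = swap_info[i].next)
		printf("type %d prio %d\n", i, swap_info[i].prio);
	return 0;
}
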
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -1558,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
char *pathname;
+ int oom_score_adj;
int i, type, prev;
int err;
@@ -1616,37 +1615,22 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->flags &= ~SWP_WRITEOK;
spin_unlock(&swap_lock);
- current->flags |= PF_OOM_ORIGIN;
+ oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = try_to_unuse(type);
- current->flags &= ~PF_OOM_ORIGIN;
+ test_set_oom_score_adj(oom_score_adj);
if (err) {
+ /*
+ * reading p->prio and p->swap_map outside the lock is
+ * safe here because only sys_swapon and sys_swapoff
+ * change them, and there can be no other sys_swapon or
+ * sys_swapoff for this swap_info_struct at this point.
+ */
/* re-insert swap space back into swap_list */
- spin_lock(&swap_lock);
- if (p->prio < 0)
- p->prio = --least_priority;
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
- }
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = type;
- else
- swap_info[prev]->next = type;
- nr_swap_pages += p->pages;
- total_swap_pages += p->pages;
- p->flags |= SWP_WRITEOK;
- spin_unlock(&swap_lock);
+ enable_swap_info(p, p->prio, p->swap_map);
goto out_dput;
}
- /* wait for any unplug function to finish */
- down_write(&swap_unplug_sem);
- up_write(&swap_unplug_sem);
-
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
@@ -1697,19 +1681,14 @@ out:
}
#ifdef CONFIG_PROC_FS
-struct proc_swaps {
- struct seq_file seq;
- int event;
-};
-
static unsigned swaps_poll(struct file *file, poll_table *wait)
{
- struct proc_swaps *s = file->private_data;
+ struct seq_file *seq = file->private_data;
poll_wait(file, &proc_poll_wait, wait);
- if (s->event != atomic_read(&proc_poll_event)) {
- s->event = atomic_read(&proc_poll_event);
+ if (seq->poll_event != atomic_read(&proc_poll_event)) {
+ seq->poll_event = atomic_read(&proc_poll_event);
return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
}
@@ -1799,24 +1778,16 @@ static const struct seq_operations swaps_op = {
static int swaps_open(struct inode *inode, struct file *file)
{
- struct proc_swaps *s;
+ struct seq_file *seq;
int ret;
- s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- file->private_data = s;
-
ret = seq_open(file, &swaps_op);
- if (ret) {
- kfree(s);
+ if (ret)
return ret;
- }
- s->seq.private = s;
- s->event = atomic_read(&proc_poll_event);
- return ret;
+ seq = file->private_data;
+ seq->poll_event = atomic_read(&proc_poll_event);
+ return 0;
}
static const struct file_operations proc_swaps_operations = {
@@ -1844,49 +1815,24 @@ static int __init max_swapfiles_check(void)
late_initcall(max_swapfiles_check);
#endif
-/*
- * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
- *
- * The swapon system call
- */
-SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
- char *name = NULL;
- struct block_device *bdev = NULL;
- struct file *swap_file = NULL;
- struct address_space *mapping;
unsigned int type;
- int i, prev;
- int error;
- union swap_header *swap_header;
- unsigned int nr_good_pages;
- int nr_extents = 0;
- sector_t span;
- unsigned long maxpages;
- unsigned long swapfilepages;
- unsigned char *swap_map = NULL;
- struct page *page = NULL;
- struct inode *inode = NULL;
- int did_down = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
if (!(swap_info[type]->flags & SWP_USED))
break;
}
- error = -EPERM;
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
kfree(p);
- goto out;
+ return ERR_PTR(-EPERM);
}
if (type >= nr_swapfiles) {
p->type = type;
@@ -1911,81 +1857,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->next = -1;
spin_unlock(&swap_lock);
- name = getname(specialfile);
- error = PTR_ERR(name);
- if (IS_ERR(name)) {
- name = NULL;
- goto bad_swap_2;
- }
- swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
- error = PTR_ERR(swap_file);
- if (IS_ERR(swap_file)) {
- swap_file = NULL;
- goto bad_swap_2;
- }
-
- p->swap_file = swap_file;
- mapping = swap_file->f_mapping;
- inode = mapping->host;
-
- error = -EBUSY;
- for (i = 0; i < nr_swapfiles; i++) {
- struct swap_info_struct *q = swap_info[i];
+ return p;
+}
- if (i == type || !q->swap_file)
- continue;
- if (mapping == q->swap_file->f_mapping)
- goto bad_swap;
- }
+static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+{
+ int error;
- error = -EINVAL;
if (S_ISBLK(inode->i_mode)) {
- bdev = bdgrab(I_BDEV(inode));
- error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ p->bdev = bdgrab(I_BDEV(inode));
+ error = blkdev_get(p->bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
sys_swapon);
if (error < 0) {
- bdev = NULL;
- error = -EINVAL;
- goto bad_swap;
+ p->bdev = NULL;
+ return -EINVAL;
}
- p->old_block_size = block_size(bdev);
- error = set_blocksize(bdev, PAGE_SIZE);
+ p->old_block_size = block_size(p->bdev);
+ error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
- goto bad_swap;
- p->bdev = bdev;
+ return error;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
mutex_lock(&inode->i_mutex);
- did_down = 1;
- if (IS_SWAPFILE(inode)) {
- error = -EBUSY;
- goto bad_swap;
- }
- } else {
- goto bad_swap;
- }
+ if (IS_SWAPFILE(inode))
+ return -EBUSY;
+ } else
+ return -EINVAL;
- swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+ return 0;
+}
- /*
- * Read the swap header.
- */
- if (!mapping->a_ops->readpage) {
- error = -EINVAL;
- goto bad_swap;
- }
- page = read_mapping_page(mapping, 0, swap_file);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
- goto bad_swap;
- }
- swap_header = kmap(page);
+static unsigned long read_swap_header(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ struct inode *inode)
+{
+ int i;
+ unsigned long maxpages;
+ unsigned long swapfilepages;
if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
printk(KERN_ERR "Unable to find swap-space signature\n");
- error = -EINVAL;
- goto bad_swap;
+ return 0;
}
/* swap partition endianess hack... */
@@ -2001,8 +1915,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
printk(KERN_WARNING
"Unable to handle swap header version %d\n",
swap_header->info.version);
- error = -EINVAL;
- goto bad_swap;
+ return 0;
}
p->lowest_bit = 1;
@@ -2011,20 +1924,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
/*
* Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number of
- * bits for the swap offset in the swp_entry_t type and
- * 2) the number of bits in the a swap pte as defined by
- * the different architectures. In order to find the
- * largest possible bit mask a swap entry with swap type 0
+ * device. There are three limiting factors: 1) the number
+ * of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte as defined by the
+ * different architectures, and 3) the number of free bits
+ * in an exceptional radix_tree entry. In order to find the
+ * largest possible bit mask, a swap entry with swap type 0
* and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again and finally the swap
+ * decoded to a swp_entry_t again, and finally the swap
* offset is extracted. This will mask all the bits from
* the initial ~0UL mask that can't be encoded in either
* the swp_entry_t or the architecture definition of a
- * swap pte.
+ * swap pte. Then the same is done for a radix_tree entry.
*/
maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ swp_entry_to_pte(swp_entry(0, ~0UL))));
+ maxpages = swp_offset(radix_to_swp_entry(
+ swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
+
if (maxpages > swap_header->info.last_page) {
maxpages = swap_header->info.last_page + 1;
/* p->max is an unsigned int: don't overflow it */
@@ -2033,62 +1950,156 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
p->highest_bit = maxpages - 1;
- error = -EINVAL;
if (!maxpages)
- goto bad_swap;
+ return 0;
+ swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
if (swapfilepages && maxpages > swapfilepages) {
printk(KERN_WARNING
"Swap area shorter than signature indicates\n");
- goto bad_swap;
+ return 0;
}
if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
- goto bad_swap;
+ return 0;
if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
- goto bad_swap;
+ return 0;
- /* OK, set up the swap map and apply the bad block list */
- swap_map = vmalloc(maxpages);
- if (!swap_map) {
- error = -ENOMEM;
- goto bad_swap;
- }
+ return maxpages;
+}
+
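
The maxpages comment above describes discovering the widest usable swap offset by pushing an all-ones offset through the narrower encodings and seeing how much survives the round trip. A user-space sketch of the same trick, where a made-up 24-bit field stands in for whatever the architecture's swap pte and the radix-tree entry can hold:

#include <stdio.h>

/* Hypothetical narrow encoding: only 24 offset bits survive the round trip. */
#define OFFSET_BITS 24

static unsigned long encode(unsigned long offset)
{
	return offset & ((1UL << OFFSET_BITS) - 1);
}

static unsigned long decode(unsigned long enc)
{
	return enc;
}

int main(void)
{
	/* Push ~0UL through the encoding; what comes back is the usable mask. */
	unsigned long max_offset = decode(encode(~0UL));
	unsigned long maxpages = max_offset + 1;

	printf("max offset mask %#lx -> maxpages %lu\n", max_offset, maxpages);
	return 0;
}
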
+static int setup_swap_map_and_extents(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ unsigned long maxpages,
+ sector_t *span)
+{
+ int i;
+ unsigned int nr_good_pages;
+ int nr_extents;
- memset(swap_map, 0, maxpages);
nr_good_pages = maxpages - 1; /* omit header page */
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
- if (page_nr == 0 || page_nr > swap_header->info.last_page) {
- error = -EINVAL;
- goto bad_swap;
- }
+ if (page_nr == 0 || page_nr > swap_header->info.last_page)
+ return -EINVAL;
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
nr_good_pages--;
}
}
- error = swap_cgroup_swapon(type, maxpages);
- if (error)
- goto bad_swap;
-
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
p->pages = nr_good_pages;
- nr_extents = setup_swap_extents(p, &span);
- if (nr_extents < 0) {
- error = nr_extents;
- goto bad_swap;
- }
+ nr_extents = setup_swap_extents(p, span);
+ if (nr_extents < 0)
+ return nr_extents;
nr_good_pages = p->pages;
}
if (!nr_good_pages) {
printk(KERN_WARNING "Empty swap-file\n");
+ return -EINVAL;
+ }
+
+ return nr_extents;
+}
+
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+{
+ struct swap_info_struct *p;
+ char *name;
+ struct file *swap_file = NULL;
+ struct address_space *mapping;
+ int i;
+ int prio;
+ int error;
+ union swap_header *swap_header;
+ int nr_extents;
+ sector_t span;
+ unsigned long maxpages;
+ unsigned char *swap_map = NULL;
+ struct page *page = NULL;
+ struct inode *inode = NULL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ p = alloc_swap_info();
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ name = getname(specialfile);
+ if (IS_ERR(name)) {
+ error = PTR_ERR(name);
+ name = NULL;
+ goto bad_swap;
+ }
+ swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+ if (IS_ERR(swap_file)) {
+ error = PTR_ERR(swap_file);
+ swap_file = NULL;
+ goto bad_swap;
+ }
+
+ p->swap_file = swap_file;
+ mapping = swap_file->f_mapping;
+
+ for (i = 0; i < nr_swapfiles; i++) {
+ struct swap_info_struct *q = swap_info[i];
+
+ if (q == p || !q->swap_file)
+ continue;
+ if (mapping == q->swap_file->f_mapping) {
+ error = -EBUSY;
+ goto bad_swap;
+ }
+ }
+
+ inode = mapping->host;
+ /* If S_ISREG(inode->i_mode), claim_swapfile() will do mutex_lock(&inode->i_mutex) */
+ error = claim_swapfile(p, inode);
+ if (unlikely(error))
+ goto bad_swap;
+
+ /*
+ * Read the swap header.
+ */
+ if (!mapping->a_ops->readpage) {
+ error = -EINVAL;
+ goto bad_swap;
+ }
+ page = read_mapping_page(mapping, 0, swap_file);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
+ goto bad_swap;
+ }
+ swap_header = kmap(page);
+
+ maxpages = read_swap_header(p, swap_header, inode);
+ if (unlikely(!maxpages)) {
error = -EINVAL;
goto bad_swap;
}
+ /* OK, set up the swap map and apply the bad block list */
+ swap_map = vzalloc(maxpages);
+ if (!swap_map) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+
+ error = swap_cgroup_swapon(p->type, maxpages);
+ if (error)
+ goto bad_swap;
+
+ nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
+ maxpages, &span);
+ if (unlikely(nr_extents < 0)) {
+ error = nr_extents;
+ goto bad_swap;
+ }
+
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
p->flags |= SWP_SOLIDSTATE;
@@ -2099,58 +2110,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
mutex_lock(&swapon_mutex);
- spin_lock(&swap_lock);
+ prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
- p->prio =
+ prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- else
- p->prio = --least_priority;
- p->swap_map = swap_map;
- p->flags |= SWP_WRITEOK;
- nr_swap_pages += nr_good_pages;
- total_swap_pages += nr_good_pages;
+ enable_swap_info(p, prio, swap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
"Priority:%d extents:%d across:%lluk %s%s\n",
- nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
+ p->pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "");
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
- }
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = type;
- else
- swap_info[prev]->next = type;
- spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ if (S_ISREG(inode->i_mode))
+ inode->i_flags |= S_SWAPFILE;
error = 0;
goto out;
bad_swap:
- if (bdev) {
- set_blocksize(bdev, p->old_block_size);
- blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+ set_blocksize(p->bdev, p->old_block_size);
+ blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
destroy_swap_extents(p);
- swap_cgroup_swapoff(type);
-bad_swap_2:
+ swap_cgroup_swapoff(p->type);
spin_lock(&swap_lock);
p->swap_file = NULL;
p->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
- if (swap_file)
+ if (swap_file) {
+ if (inode && S_ISREG(inode->i_mode)) {
+ mutex_unlock(&inode->i_mutex);
+ inode = NULL;
+ }
filp_close(swap_file, NULL);
+ }
out:
if (page && !IS_ERR(page)) {
kunmap(page);
@@ -2158,11 +2157,8 @@ out:
}
if (name)
putname(name);
- if (did_down) {
- if (!error)
- inode->i_flags |= S_SWAPFILE;
+ if (inode && S_ISREG(inode->i_mode))
mutex_unlock(&inode->i_mutex);
- }
return error;
}
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd..e53f7d02c17 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -6,7 +6,7 @@
* Released under the GPL, see the file COPYING for details.
*
* Simple token based thrashing protection, using the algorithm
- * described in: http://www.cs.wm.edu/~sjiang/token.pdf
+ * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
*
* Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
* Improved algorithm to pass token:
@@ -21,14 +21,40 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
+#include <linux/memcontrol.h>
+
+#include <trace/events/vmscan.h>
+
+#define TOKEN_AGING_INTERVAL (0xFF)
static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
-static unsigned int global_faults;
+struct mem_cgroup *swap_token_memcg;
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = try_get_mem_cgroup_from_mm(mm);
+ if (memcg)
+ css_put(mem_cgroup_css(memcg));
+
+ return memcg;
+}
+#else
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+ return NULL;
+}
+#endif
void grab_swap_token(struct mm_struct *mm)
{
int current_interval;
+ unsigned int old_prio = mm->token_priority;
+ static unsigned int global_faults;
+ static unsigned int last_aging;
global_faults++;
@@ -38,40 +64,92 @@ void grab_swap_token(struct mm_struct *mm)
return;
/* First come first served */
- if (swap_token_mm == NULL) {
- mm->token_priority = mm->token_priority + 2;
- swap_token_mm = mm;
- goto out;
+ if (!swap_token_mm)
+ goto replace_token;
+
+ /*
+ * Usually, we don't need priority aging because long interval faults
+ * make the priority decrease quickly. But there is one exception. If the
+ * token owner task is sleeping, it never makes long interval faults.
+ * Thus, we need a priority aging mechanism instead. The requirements
+ * of priority aging are:
+ * 1) The aging interval must be reasonably long. Too short an aging
+ * interval makes the swap token be lost quickly and decreases performance.
+ * 2) The swap token owner task has to undergo priority aging even
+ * while it sleeps.
+ */
+ if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
+ swap_token_mm->token_priority /= 2;
+ last_aging = global_faults;
}
- if (mm != swap_token_mm) {
- if (current_interval < mm->last_interval)
- mm->token_priority++;
- else {
- if (likely(mm->token_priority > 0))
- mm->token_priority--;
- }
- /* Check if we deserve the token */
- if (mm->token_priority > swap_token_mm->token_priority) {
- mm->token_priority += 2;
- swap_token_mm = mm;
- }
- } else {
- /* Token holder came in again! */
+ if (mm == swap_token_mm) {
mm->token_priority += 2;
+ goto update_priority;
+ }
+
+ if (current_interval < mm->last_interval)
+ mm->token_priority++;
+ else {
+ if (likely(mm->token_priority > 0))
+ mm->token_priority--;
}
+ /* Check if we deserve the token */
+ if (mm->token_priority > swap_token_mm->token_priority)
+ goto replace_token;
+
+update_priority:
+ trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
+
out:
mm->faultstamp = global_faults;
mm->last_interval = current_interval;
spin_unlock(&swap_token_lock);
+ return;
+
+replace_token:
+ mm->token_priority += 2;
+ trace_replace_swap_token(swap_token_mm, mm);
+ swap_token_mm = mm;
+ swap_token_memcg = swap_token_memcg_from_mm(mm);
+ last_aging = global_faults;
+ goto out;
}
/* Called on process exit. */
void __put_swap_token(struct mm_struct *mm)
{
spin_lock(&swap_token_lock);
- if (likely(mm == swap_token_mm))
+ if (likely(mm == swap_token_mm)) {
+ trace_put_swap_token(swap_token_mm);
swap_token_mm = NULL;
+ swap_token_memcg = NULL;
+ }
spin_unlock(&swap_token_lock);
}
+
+static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
+{
+ if (!a)
+ return true;
+ if (!b)
+ return true;
+ if (a == b)
+ return true;
+ return false;
+}
+
+void disable_swap_token(struct mem_cgroup *memcg)
+{
+ /* memcg reclaim doesn't disable an unrelated mm's token. */
+ if (match_memcg(memcg, swap_token_memcg)) {
+ spin_lock(&swap_token_lock);
+ if (match_memcg(memcg, swap_token_memcg)) {
+ trace_disable_swap_token(swap_token_mm);
+ swap_token_mm = NULL;
+ swap_token_memcg = NULL;
+ }
+ spin_unlock(&swap_token_lock);
+ }
+}
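As an aside on the aging rule added above: a minimal userspace sketch of the halve-every-TOKEN_AGING_INTERVAL behaviour, assuming an illustrative starting priority of 64. The loop, counters and printout are invented for the demo and are not kernel code.

#include <stdio.h>

#define TOKEN_AGING_INTERVAL (0xFF)

/* Model of the aging rule: whenever more than TOKEN_AGING_INTERVAL
 * global faults pass without aging, the token owner's priority is
 * halved, so a sleeping owner cannot hold the token forever. */
int main(void)
{
	unsigned int global_faults = 0, last_aging = 0;
	unsigned int owner_priority = 64;	/* illustrative value */

	for (int fault = 0; fault < 2000; fault++) {
		global_faults++;
		if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
			owner_priority /= 2;
			last_aging = global_faults;
			printf("fault %u: owner priority aged to %u\n",
			       global_faults, owner_priority);
		}
	}
	return 0;
}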
diff --git a/mm/truncate.c b/mm/truncate.c
index d64296be00d..b40ac6d4e86 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
+#include <linux/cleancache.h>
#include "internal.h"
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+ cleancache_flush_page(page->mapping, page);
if (page_has_private(page))
do_invalidatepage(page, partial);
}
@@ -106,9 +108,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
cancel_dirty_page(page, PAGE_CACHE_SIZE);
clear_page_mlock(page);
- remove_from_page_cache(page);
ClearPageMappedToDisk(page);
- page_cache_release(page); /* pagecache ref */
+ delete_from_page_cache(page);
return 0;
}
@@ -198,9 +199,6 @@ int invalidate_inode_page(struct page *page)
* The first pass will remove most pages, so the search cost of the second pass
* is low.
*
- * When looking at page->index outside the page lock we need to be careful to
- * copy it into a local to avoid races (it could change at any time).
- *
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
@@ -209,12 +207,13 @@ void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
- pgoff_t end;
const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
- pgoff_t next;
+ pgoff_t index;
+ pgoff_t end;
int i;
+ cleancache_flush_inode(mapping);
if (mapping->nrpages == 0)
return;
@@ -222,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping,
end = (lend >> PAGE_CACHE_SHIFT);
pagevec_init(&pvec, 0);
- next = start;
- while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ index = start;
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t page_index = page->index;
- if (page_index > end) {
- next = page_index;
+ /* We rely upon deletion not changing page->index */
+ index = page->index;
+ if (index > end)
break;
- }
- if (page_index > next)
- next = page_index;
- next++;
if (!trylock_page(page))
continue;
+ WARN_ON(page->index != index);
if (PageWriteback(page)) {
unlock_page(page);
continue;
@@ -250,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
if (partial) {
@@ -262,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
}
- next = start;
+ index = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- if (next == start)
+ if (!pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ if (index == start)
break;
- next = start;
+ index = start;
continue;
}
- if (pvec.pages[0]->index > end) {
+ if (index == start && pvec.pages[0]->index > end) {
pagevec_release(&pvec);
break;
}
@@ -279,19 +277,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- if (page->index > end)
+ /* We rely upon deletion not changing page->index */
+ index = page->index;
+ if (index > end)
break;
+
lock_page(page);
+ WARN_ON(page->index != index);
wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
- if (page->index > next)
- next = page->index;
- next++;
unlock_page(page);
}
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
+ index++;
}
+ cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -301,6 +302,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
* @lstart: offset from which to truncate
*
* Called under (and serialised by) inode->i_mutex.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
@@ -322,48 +328,53 @@ EXPORT_SYMBOL(truncate_inode_pages);
* pagetables.
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+ pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
- pgoff_t next = start;
- unsigned long ret = 0;
+ pgoff_t index = start;
+ unsigned long ret;
+ unsigned long count = 0;
int i;
+ /*
+ * Note: this function may get called on a shmem/tmpfs mapping:
+ * pagevec_lookup() might then return 0 prematurely (because it
+ * got a gangful of swap entries); but it's hardly worth worrying
+ * about - it can rarely have anything to free from such a mapping
+ * (most pages are dirty), and already skips over any difficulties.
+ */
+
pagevec_init(&pvec, 0);
- while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t index;
- int lock_failed;
-
- lock_failed = !trylock_page(page);
- /*
- * We really shouldn't be looking at the ->index of an
- * unlocked page. But we're not allowed to lock these
- * pages. So we rely upon nobody altering the ->index
- * of this (pinned-by-us) page.
- */
+ /* We rely upon deletion not changing page->index */
index = page->index;
- if (index > next)
- next = index;
- next++;
- if (lock_failed)
- continue;
-
- ret += invalidate_inode_page(page);
+ if (index > end)
+ break;
+ if (!trylock_page(page))
+ continue;
+ WARN_ON(page->index != index);
+ ret = invalidate_inode_page(page);
unlock_page(page);
- if (next > end)
- break;
+ /*
+ * Invalidation is a hint that the page is no longer
+ * of interest, so try to speed up its reclaim.
+ */
+ if (!ret)
+ deactivate_page(page);
+ count += ret;
}
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
- return ret;
+ return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
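The min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1 expression used throughout these hunks can be read as "ask for a full pagevec, but never for pages beyond end". A small userspace sketch, assuming a PAGEVEC_SIZE of 14 purely for illustration (the real value depends on the kernel configuration):

#include <stdio.h>

#define PAGEVEC_SIZE 14UL	/* illustrative, not the kernel constant */

/* Ask for at most PAGEVEC_SIZE pages, but never more than remain
 * in the inclusive range [index, end]. */
static unsigned long lookup_count(unsigned long index, unsigned long end)
{
	unsigned long remaining = end - index;
	unsigned long cap = PAGEVEC_SIZE - 1;

	return (remaining < cap ? remaining : cap) + 1;
}

int main(void)
{
	printf("%lu\n", lookup_count(0, 100));   /* 14: a full pagevec   */
	printf("%lu\n", lookup_count(95, 100));  /* 6: pages 95..100     */
	printf("%lu\n", lookup_count(100, 100)); /* 1: just the last one */
	return 0;
}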
@@ -389,7 +400,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
clear_page_mlock(page);
BUG_ON(page_has_private(page));
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
@@ -427,36 +438,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
- pgoff_t next;
+ pgoff_t index;
int i;
int ret = 0;
int ret2 = 0;
int did_range_unmap = 0;
- int wrapped = 0;
+ cleancache_flush_inode(mapping);
pagevec_init(&pvec, 0);
- next = start;
- while (next <= end && !wrapped &&
- pagevec_lookup(&pvec, mapping, next,
- min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ index = start;
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t page_index;
+
+ /* We rely upon deletion not changing page->index */
+ index = page->index;
+ if (index > end)
+ break;
lock_page(page);
+ WARN_ON(page->index != index);
if (page->mapping != mapping) {
unlock_page(page);
continue;
}
- page_index = page->index;
- next = page_index + 1;
- if (next == 0)
- wrapped = 1;
- if (page_index > end) {
- unlock_page(page);
- break;
- }
wait_on_page_writeback(page);
if (page_mapped(page)) {
if (!did_range_unmap) {
@@ -464,9 +471,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Zap the rest of the file in one hit.
*/
unmap_mapping_range(mapping,
- (loff_t)page_index<<PAGE_CACHE_SHIFT,
- (loff_t)(end - page_index + 1)
- << PAGE_CACHE_SHIFT,
+ (loff_t)index << PAGE_CACHE_SHIFT,
+ (loff_t)(1 + end - index)
+ << PAGE_CACHE_SHIFT,
0);
did_range_unmap = 1;
} else {
@@ -474,8 +481,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Just zap this page
*/
unmap_mapping_range(mapping,
- (loff_t)page_index<<PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ (loff_t)index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
}
}
BUG_ON(page_mapped(page));
@@ -491,7 +498,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
+ cleancache_flush_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
@@ -514,8 +523,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
/**
* truncate_pagecache - unmap and remove pagecache that has been truncated
* @inode: inode
- * @old: old file offset
- * @new: new file offset
+ * @oldsize: old file size
+ * @newsize: new file size
*
* inode's new i_size must already be written before truncate_pagecache
* is called.
@@ -527,9 +536,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
* situations such as writepage being called for a page that has already
* had its underlying blocks deallocated.
*/
-void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
+void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
{
struct address_space *mapping = inode->i_mapping;
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
/*
* unmap_mapping_range is called twice, first simply for
@@ -540,9 +550,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
* truncate_inode_pages finishes, hence the second
* unmap_mapping_range call must be made for correctness.
*/
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, new);
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+ unmap_mapping_range(mapping, holebegin, 0, 1);
+ truncate_inode_pages(mapping, newsize);
+ unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
@@ -572,22 +582,47 @@ EXPORT_SYMBOL(truncate_setsize);
/**
* vmtruncate - unmap mappings "freed" by truncate() syscall
* @inode: inode of the file used
- * @offset: file offset to start truncating
+ * @newsize: file offset to start truncating
*
* This function is deprecated and truncate_setsize or truncate_pagecache
* should be used instead, together with filesystem specific block truncation.
*/
-int vmtruncate(struct inode *inode, loff_t offset)
+int vmtruncate(struct inode *inode, loff_t newsize)
{
int error;
- error = inode_newsize_ok(inode, offset);
+ error = inode_newsize_ok(inode, newsize);
if (error)
return error;
- truncate_setsize(inode, offset);
+ truncate_setsize(inode, newsize);
if (inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
}
EXPORT_SYMBOL(vmtruncate);
+
+int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t holebegin = round_up(lstart, PAGE_SIZE);
+ loff_t holelen = 1 + lend - holebegin;
+
+ /*
+ * If the underlying filesystem is not going to provide
+ * a way to truncate a range of blocks (punch a hole) -
+ * we should return failure right now.
+ */
+ if (!inode->i_op->truncate_range)
+ return -ENOSYS;
+
+ mutex_lock(&inode->i_mutex);
+ inode_dio_wait(inode);
+ unmap_mapping_range(mapping, holebegin, holelen, 1);
+ inode->i_op->truncate_range(inode, lstart, lend);
+ /* unmap again to remove racily COWed private pages */
+ unmap_mapping_range(mapping, holebegin, holelen, 1);
+ mutex_unlock(&inode->i_mutex);
+
+ return 0;
+}
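For reference, the holebegin/holelen arithmetic in vmtruncate_range() only unmaps whole pages that lie entirely inside [lstart, lend]. A worked userspace sketch with an assumed 4K page size and made-up byte offsets:

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the demo */

static unsigned long round_up_to(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

/* The start of the hole is rounded up to a page boundary and the
 * length covers the rest of the requested range, so only whole pages
 * strictly inside [lstart, lend] get unmapped. */
int main(void)
{
	unsigned long lstart = 5000, lend = 20479;	/* hypothetical range */
	unsigned long holebegin = round_up_to(lstart, PAGE_SIZE);
	unsigned long holelen = 1 + lend - holebegin;

	printf("holebegin=%lu holelen=%lu\n", holebegin, holelen);
	/* prints holebegin=8192 holelen=12288, i.e. three whole pages */
	return 0;
}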
diff --git a/mm/util.c b/mm/util.c
index f126975ef23..88ea1bd661c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,8 @@
#include <linux/sched.h>
#include <asm/uaccess.h>
+#include "internal.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
@@ -215,6 +217,28 @@ char *strndup_user(const char __user *s, long n)
}
EXPORT_SYMBOL(strndup_user);
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent)
+{
+ struct vm_area_struct *next;
+
+ vma->vm_prev = prev;
+ if (prev) {
+ next = prev->vm_next;
+ prev->vm_next = vma;
+ } else {
+ mm->mmap = vma;
+ if (rb_parent)
+ next = rb_entry(rb_parent,
+ struct vm_area_struct, vm_rb);
+ else
+ next = NULL;
+ }
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
+}
+
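A minimal userspace model of the list splice __vma_link_list() performs, using plain structs in place of mm_struct/vm_area_struct. The rb-tree fallback that supplies "next" when prev is NULL is omitted; here "next" is simply taken from the old list head. All names below are illustrative.

#include <stdio.h>
#include <stddef.h>

struct node { struct node *next, *prev; int id; };

/* Insert "n" after "prev" in a doubly linked list whose head is *head;
 * with no predecessor, n becomes the new head. */
static void link_after(struct node **head, struct node *n, struct node *prev)
{
	struct node *next;

	n->prev = prev;
	if (prev) {
		next = prev->next;
		prev->next = n;
	} else {
		next = *head;
		*head = n;
	}
	n->next = next;
	if (next)
		next->prev = n;
}

int main(void)
{
	struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct node *head = NULL;

	link_after(&head, &a, NULL);	/* list: 1     */
	link_after(&head, &c, &a);	/* list: 1 3   */
	link_after(&head, &b, &a);	/* list: 1 2 3 */

	for (struct node *n = head; n; n = n->next)
		printf("%d ", n->id);
	printf("\n");
	return 0;
}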
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
@@ -227,7 +251,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
- * If the architecture not support this fucntion, simply return with no
+ * If the architecture does not support this function, simply return with no
* page pinned
*/
int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f9b166732e7..7ef0903058e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -26,7 +26,7 @@
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -261,8 +261,15 @@ struct vmap_area {
};
static DEFINE_SPINLOCK(vmap_area_lock);
-static struct rb_root vmap_area_root = RB_ROOT;
static LIST_HEAD(vmap_area_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+
+/* The vmap cache globals are protected by vmap_area_lock */
+static struct rb_node *free_vmap_cache;
+static unsigned long cached_hole_size;
+static unsigned long cached_vstart;
+static unsigned long cached_align;
+
static unsigned long vmap_area_pcpu_hole;
static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
struct rb_node *n;
unsigned long addr;
int purged = 0;
+ struct vmap_area *first;
BUG_ON(!size);
BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(!is_power_of_2(align));
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
@@ -341,91 +350,127 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-ENOMEM);
retry:
- addr = ALIGN(vstart, align);
-
spin_lock(&vmap_area_lock);
- if (addr + size - 1 < addr)
- goto overflow;
+ /*
+ * Invalidate cache if we have more permissive parameters.
+ * cached_hole_size notes the largest hole noticed _below_
+ * the vmap_area cached in free_vmap_cache: if size fits
+ * into that hole, we want to scan from vstart to reuse
+ * the hole instead of allocating above free_vmap_cache.
+ * Note that __free_vmap_area may update free_vmap_cache
+ * without updating cached_hole_size or cached_align.
+ */
+ if (!free_vmap_cache ||
+ size < cached_hole_size ||
+ vstart < cached_vstart ||
+ align < cached_align) {
+nocache:
+ cached_hole_size = 0;
+ free_vmap_cache = NULL;
+ }
+ /* record if we encounter less permissive parameters */
+ cached_vstart = vstart;
+ cached_align = align;
+
+ /* find starting point for our search */
+ if (free_vmap_cache) {
+ first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ addr = ALIGN(first->va_end, align);
+ if (addr < vstart)
+ goto nocache;
+ if (addr + size - 1 < addr)
+ goto overflow;
- /* XXX: could have a last_hole cache */
- n = vmap_area_root.rb_node;
- if (n) {
- struct vmap_area *first = NULL;
+ } else {
+ addr = ALIGN(vstart, align);
+ if (addr + size - 1 < addr)
+ goto overflow;
+
+ n = vmap_area_root.rb_node;
+ first = NULL;
- do {
+ while (n) {
struct vmap_area *tmp;
tmp = rb_entry(n, struct vmap_area, rb_node);
if (tmp->va_end >= addr) {
- if (!first && tmp->va_start < addr + size)
- first = tmp;
- n = n->rb_left;
- } else {
first = tmp;
+ if (tmp->va_start <= addr)
+ break;
+ n = n->rb_left;
+ } else
n = n->rb_right;
- }
- } while (n);
+ }
if (!first)
goto found;
-
- if (first->va_end < addr) {
- n = rb_next(&first->rb_node);
- if (n)
- first = rb_entry(n, struct vmap_area, rb_node);
- else
- goto found;
- }
-
- while (addr + size > first->va_start && addr + size <= vend) {
- addr = ALIGN(first->va_end + PAGE_SIZE, align);
- if (addr + size - 1 < addr)
- goto overflow;
-
- n = rb_next(&first->rb_node);
- if (n)
- first = rb_entry(n, struct vmap_area, rb_node);
- else
- goto found;
- }
}
-found:
- if (addr + size > vend) {
-overflow:
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = 1;
- goto retry;
- }
- if (printk_ratelimit())
- printk(KERN_WARNING
- "vmap allocation for size %lu failed: "
- "use vmalloc=<size> to increase size.\n", size);
- kfree(va);
- return ERR_PTR(-EBUSY);
+
+ /* from the starting point, walk areas until a suitable hole is found */
+ while (addr + size > first->va_start && addr + size <= vend) {
+ if (addr + cached_hole_size < first->va_start)
+ cached_hole_size = first->va_start - addr;
+ addr = ALIGN(first->va_end, align);
+ if (addr + size - 1 < addr)
+ goto overflow;
+
+ n = rb_next(&first->rb_node);
+ if (n)
+ first = rb_entry(n, struct vmap_area, rb_node);
+ else
+ goto found;
}
- BUG_ON(addr & (align-1));
+found:
+ if (addr + size > vend)
+ goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
__insert_vmap_area(va);
+ free_vmap_cache = &va->rb_node;
spin_unlock(&vmap_area_lock);
- return va;
-}
+ BUG_ON(va->va_start & (align-1));
+ BUG_ON(va->va_start < vstart);
+ BUG_ON(va->va_end > vend);
-static void rcu_free_va(struct rcu_head *head)
-{
- struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
+ return va;
+overflow:
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = 1;
+ goto retry;
+ }
+ if (printk_ratelimit())
+ printk(KERN_WARNING
+ "vmap allocation for size %lu failed: "
+ "use vmalloc=<size> to increase size.\n", size);
kfree(va);
+ return ERR_PTR(-EBUSY);
}
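A quick userspace sketch of the two checks the reworked allocator repeats while walking: round the candidate address up to the requested alignment, and reject it on unsigned wraparound. The ALIGN macro and the sample addresses are for illustration only.

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* Returns 1 if the aligned candidate [addr, addr + size) does not wrap
 * around the top of the address space, 0 otherwise. */
static int candidate_ok(unsigned long addr, unsigned long size,
			unsigned long align)
{
	addr = ALIGN(addr, align);
	if (addr + size - 1 < addr)	/* unsigned wraparound == overflow */
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", candidate_ok(0x1003, 0x1000, 0x1000));	   /* 1 */
	printf("%d\n", candidate_ok(~0UL - 0xfff, 0x2000, 0x1000)); /* 0 */
	return 0;
}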
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+
+ if (free_vmap_cache) {
+ if (va->va_end < cached_vstart) {
+ free_vmap_cache = NULL;
+ } else {
+ struct vmap_area *cache;
+ cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ if (va->va_start <= cache->va_start) {
+ free_vmap_cache = rb_prev(&va->rb_node);
+ /*
+ * We don't try to update cached_hole_size or
+ * cached_align, but it won't go very wrong.
+ */
+ }
+ }
+ }
rb_erase(&va->rb_node, &vmap_area_root);
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
@@ -439,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
- call_rcu(&va->rcu_head, rcu_free_va);
+ kfree_rcu(va, rcu_head);
}
/*
@@ -680,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
-#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
- VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
- VMALLOC_PAGES / NR_CPUS / 16))
+#define VMAP_BBMAP_BITS \
+ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
+ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
+ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
@@ -785,13 +831,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
return vb;
}
-static void rcu_free_vb(struct rcu_head *head)
-{
- struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
-
- kfree(vb);
-}
-
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
@@ -804,7 +843,7 @@ static void free_vmap_block(struct vmap_block *vb)
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
- call_rcu(&vb->rcu_head, rcu_free_vb);
+ kfree_rcu(vb, rcu_head);
}
static void purge_fragmented_blocks(int cpu)
@@ -1482,6 +1521,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node, void *caller)
{
+ const int order = 0;
struct page **pages;
unsigned int nr_pages, array_size, i;
gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1508,11 +1548,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
+ gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
if (node < 0)
- page = alloc_page(gfp_mask);
+ page = alloc_page(tmp_mask);
else
- page = alloc_pages_node(node, gfp_mask, 0);
+ page = alloc_pages_node(node, tmp_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -1527,6 +1568,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
+ warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
+ "allocated %ld of %ld bytes\n",
+ (area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}
@@ -1951,8 +1995,6 @@ finished:
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
* any information, as /dev/kmem.
- *
- * The caller should guarantee KM_USER1 is not used.
*/
long vwrite(char *buf, char *addr, unsigned long count)
@@ -2098,10 +2140,6 @@ struct vm_struct *alloc_vm_area(size_t size)
return NULL;
}
- /* Make sure the pagetables are constructed in process kernel
- mappings */
- vmalloc_sync_all();
-
return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6771ea70bfe..7ef69124fa3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,8 @@
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/oom.h>
+#include <linux/prefetch.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -93,8 +95,6 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
int may_swap;
- int swappiness;
-
int order;
/*
@@ -105,6 +105,7 @@ struct scan_control {
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
+ struct memcg_scanrecord *memcg_record;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -171,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
struct scan_control *sc, enum lru_list lru)
{
if (!scanning_global_lru(sc))
- return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
+ return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
+ zone_to_nid(zone), zone_idx(zone), BIT(lru));
return zone_page_state(zone, NR_LRU_BASE + lru);
}
@@ -200,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
+static inline int do_shrinker_shrink(struct shrinker *shrinker,
+ struct shrink_control *sc,
+ unsigned long nr_to_scan)
+{
+ sc->nr_to_scan = nr_to_scan;
+ return (*shrinker->shrink)(shrinker, sc);
+}
+
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
@@ -220,67 +230,114 @@ EXPORT_SYMBOL(unregister_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+unsigned long shrink_slab(struct shrink_control *shrink,
+ unsigned long nr_pages_scanned,
+ unsigned long lru_pages)
{
struct shrinker *shrinker;
unsigned long ret = 0;
- if (scanned == 0)
- scanned = SWAP_CLUSTER_MAX;
+ if (nr_pages_scanned == 0)
+ nr_pages_scanned = SWAP_CLUSTER_MAX;
- if (!down_read_trylock(&shrinker_rwsem))
- return 1; /* Assume we'll be able to shrink next time */
+ if (!down_read_trylock(&shrinker_rwsem)) {
+ /* Assume we'll be able to shrink next time */
+ ret = 1;
+ goto out;
+ }
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass;
+ int shrink_ret = 0;
+ long nr;
+ long new_nr;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
- max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- delta = (4 * scanned) / shrinker->seeks;
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ do {
+ nr = shrinker->nr;
+ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+ total_scan = nr;
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
- shrinker->nr += delta;
- if (shrinker->nr < 0) {
+ total_scan += delta;
+ if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
- shrinker->shrink, shrinker->nr);
- shrinker->nr = max_pass;
+ shrinker->shrink, total_scan);
+ total_scan = max_pass;
}
/*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
* Avoid the risk of looping forever due to a too-large nr value:
* never try to free more than twice the estimated number of
* freeable entries.
*/
- if (shrinker->nr > max_pass * 2)
- shrinker->nr = max_pass * 2;
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
- total_scan = shrinker->nr;
- shrinker->nr = 0;
+ trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
- while (total_scan >= SHRINK_BATCH) {
- long this_scan = SHRINK_BATCH;
- int shrink_ret;
+ while (total_scan >= batch_size) {
int nr_before;
- nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
- shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
- gfp_mask);
+ nr_before = do_shrinker_shrink(shrinker, shrink, 0);
+ shrink_ret = do_shrinker_shrink(shrinker, shrink,
+ batch_size);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
- count_vm_events(SLABS_SCANNED, this_scan);
- total_scan -= this_scan;
+ count_vm_events(SLABS_SCANNED, batch_size);
+ total_scan -= batch_size;
cond_resched();
}
- shrinker->nr += total_scan;
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ do {
+ nr = shrinker->nr;
+ new_nr = total_scan + nr;
+ if (total_scan <= 0)
+ break;
+ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
up_read(&shrinker_rwsem);
+out:
+ cond_resched();
return ret;
}
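The shrinker->nr handling above follows a claim-then-return-the-remainder pattern. A userspace sketch using C11 atomics in place of the kernel's cmpxchg(); the counts are invented, and the give-back is simplified to a fetch_add rather than the second cmpxchg loop used in the hunk.

#include <stdio.h>
#include <stdatomic.h>

static _Atomic long deferred_nr = 1000;	/* pretend deferred scan count */

/* Atomically take the whole deferred count so concurrent callers
 * don't repeat the same scanning work. */
static long claim_all(void)
{
	long nr;

	do {
		nr = atomic_load(&deferred_nr);
	} while (!atomic_compare_exchange_weak(&deferred_nr, &nr, 0));
	return nr;
}

/* Hand any unused portion back for the next caller. */
static void return_unused(long leftover)
{
	if (leftover > 0)
		atomic_fetch_add(&deferred_nr, leftover);
}

int main(void)
{
	long total_scan = claim_all();	/* this caller now owns 1000 units */
	long scanned = 700;		/* pretend we only processed 700   */

	return_unused(total_scan - scanned);
	printf("deferred after pass: %ld\n", atomic_load(&deferred_nr)); /* 300 */
	return 0;
}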
@@ -358,7 +415,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
- lock_page_nosync(page);
+ lock_page(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
unlock_page(page);
@@ -514,7 +571,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
freepage = mapping->a_ops->freepage;
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
@@ -936,7 +993,7 @@ keep_lumpy:
* back off and wait for congestion to clear because further reclaim
* will encounter the same problem
*/
- if (nr_dirty == nr_congested && nr_dirty != 0)
+ if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
zone_set_flag(zone, ZONE_CONGESTED);
free_page_list(&free_pages);
@@ -1065,7 +1122,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* surrounding the tag page. Only take those pages of
* the same active state as that tag page. We may safely
* round the target page pfn down to the requested order
- * as the mem_map is guarenteed valid out to MAX_ORDER,
+ * as the mem_map is guaranteed valid out to MAX_ORDER,
* where that page is in a different zone we will detect
* it from its zone id and abort this block scan.
*/
@@ -1108,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
nr_lumpy_dirty++;
scan++;
} else {
- /* the page is freed already. */
- if (!page_count(cursor_page))
+ /*
+ * Check if the page is freed already.
+ *
+ * We can't use page_count() as that
+ * requires compound_head and we don't
+ * have a pin on the page here. If a
+ * page is tail, we may or may not
+ * have isolated the head, so assume
+ * it's not free; it'd be tricky to
+ * track the head status without a
+ * page pin.
+ */
+ if (!PageTail(cursor_page) &&
+ !atomic_read(&cursor_page->_count))
continue;
break;
}
@@ -1200,13 +1269,16 @@ int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
+ VM_BUG_ON(!page_count(page));
+
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && get_page_unless_zero(page)) {
+ if (PageLRU(page)) {
int lru = page_lru(page);
ret = 0;
+ get_page(page);
ClearPageLRU(page);
del_page_from_lru_list(zone, page, lru);
@@ -1277,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1320,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
reclaim_stat->recent_scanned[0] += *nr_anon;
reclaim_stat->recent_scanned[1] += *nr_file;
+ if (!scanning_global_lru(sc)) {
+ sc->memcg_record->nr_scanned[0] += *nr_anon;
+ sc->memcg_record->nr_scanned[1] += *nr_file;
+ }
}
/*
@@ -1433,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_freed[file] += nr_reclaimed;
+
local_irq_disable();
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1532,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
reclaim_stat->recent_scanned[file] += nr_taken;
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, pgscanned);
if (file)
@@ -1583,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
+ if (!scanning_global_lru(sc))
+ sc->memcg_record->nr_rotated[file] += nr_rotated;
move_active_pages_to_lru(zone, &l_active,
LRU_ACTIVE + file * LRU_FILE);
@@ -1698,24 +1783,11 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}
-/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
- unsigned long *nr_saved_scan)
+static int vmscan_swappiness(struct scan_control *sc)
{
- unsigned long nr;
-
- *nr_saved_scan += nr_to_scan;
- nr = *nr_saved_scan;
-
- if (nr >= SWAP_CLUSTER_MAX)
- *nr_saved_scan = 0;
- else
- nr = 0;
-
- return nr;
+ if (scanning_global_lru(sc))
+ return vm_swappiness;
+ return mem_cgroup_swappiness(sc->mem_cgroup);
}
/*
@@ -1736,6 +1808,23 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
u64 fraction[2], denominator;
enum lru_list l;
int noswap = 0;
+ int force_scan = 0;
+ unsigned long nr_force_scan[2];
+
+
+ anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
+ if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
+ /* kswapd does zone balancing and needs to scan this zone */
+ if (scanning_global_lru(sc) && current_is_kswapd())
+ force_scan = 1;
+ /* memcg may have a small limit and needs to avoid a priority drop */
+ if (!scanning_global_lru(sc))
+ force_scan = 1;
+ }
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1743,14 +1832,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
+ nr_force_scan[0] = 0;
+ nr_force_scan[1] = SWAP_CLUSTER_MAX;
goto out;
}
- anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
if (scanning_global_lru(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
@@ -1759,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
+ nr_force_scan[0] = SWAP_CLUSTER_MAX;
+ nr_force_scan[1] = 0;
goto out;
}
}
@@ -1767,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = sc->swappiness;
- file_prio = 200 - sc->swappiness;
+ anon_prio = vmscan_swappiness(sc);
+ file_prio = 200 - vmscan_swappiness(sc);
/*
* OK, so we have swap space and a fair amount of page cache
@@ -1807,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
+ if (force_scan) {
+ unsigned long scan = SWAP_CLUSTER_MAX;
+ nr_force_scan[0] = div64_u64(scan * ap, denominator);
+ nr_force_scan[1] = div64_u64(scan * fp, denominator);
+ }
out:
for_each_evictable_lru(l) {
int file = is_file_lru(l);
@@ -1817,8 +1910,19 @@ out:
scan >>= priority;
scan = div64_u64(scan * fraction[file], denominator);
}
- nr[l] = nr_scan_try_batch(scan,
- &reclaim_stat->nr_saved_scan[l]);
+
+ /*
+ * If the zone or the memcg is small, nr[l] can be 0. This results
+ * in no scan at this priority and the priority dropping down.
+ * Global direct reclaim can visit the next zone and tends not to
+ * have problems. Global kswapd does zone balancing and needs to
+ * scan a small amount. With memcg, a priority drop can cause big
+ * latency, so it's better to scan a small amount here too. See
+ * force_scan above.
+ */
+ if (!scan && force_scan)
+ scan = nr_force_scan[file];
+ nr[l] = scan;
}
}
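A worked sketch of the nr_force_scan[] split computed above: when the proportional scan count would round down to zero, a fixed SWAP_CLUSTER_MAX batch is divided between the anon and file LRUs in the same ap:fp ratio. The ap and fp values below are invented purely for the arithmetic.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
	unsigned long ap = 60, fp = 140;	/* hypothetical weighted prios */
	unsigned long denominator = ap + fp + 1;
	unsigned long scan = SWAP_CLUSTER_MAX;

	unsigned long nr_force_scan_anon = scan * ap / denominator;
	unsigned long nr_force_scan_file = scan * fp / denominator;

	printf("anon=%lu file=%lu\n", nr_force_scan_anon, nr_force_scan_file);
	/* anon=9 file=22: small but non-zero, so reclaim never stalls */
	return 0;
}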
@@ -1963,6 +2067,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
{
struct zoneref *z;
struct zone *zone;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1977,6 +2083,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ /*
+ * This steals pages from memory cgroups over softlimit
+ * and returns the number of reclaimed pages and
+ * scanned pages. This works for global memory pressure
+ * and balancing, not for a memcg's limit.
+ */
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
+ sc->nr_reclaimed += nr_soft_reclaimed;
+ sc->nr_scanned += nr_soft_scanned;
+ /* need some check to avoid more shrink_zone() calls */
}
shrink_zone(priority, zone, sc);
@@ -1988,17 +2107,12 @@ static bool zone_reclaimable(struct zone *zone)
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
-/*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. It can't handle OOM during hibernation.
- * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
- */
+/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2006,13 +2120,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone_reclaimable(zone)) {
- all_unreclaimable = false;
- break;
- }
+ if (!zone->all_unreclaimable)
+ return false;
}
- return all_unreclaimable;
+ return true;
}
/*
@@ -2032,7 +2144,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
- struct scan_control *sc)
+ struct scan_control *sc,
+ struct shrink_control *shrink)
{
int priority;
unsigned long total_scanned = 0;
@@ -2050,7 +2163,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
if (!priority)
- disable_swap_token();
+ disable_swap_token(sc->mem_cgroup);
shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
@@ -2066,7 +2179,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
lru_pages += zone_reclaimable_pages(zone);
}
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ shrink_slab(shrink, sc->nr_scanned, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -2108,6 +2221,14 @@ out:
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
+ /*
+ * While hibernation is going on, kswapd is frozen so that it can't
+ * mark the zone all_unreclaimable. Thus we bypass the
+ * all_unreclaimable check.
+ */
+ if (oom_killer_disabled)
+ return 0;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
@@ -2125,17 +2246,19 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
.may_swap = 1,
- .swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
.nodemask = nodemask,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
@@ -2145,19 +2268,23 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
- gfp_t gfp_mask, bool noswap,
- unsigned int swappiness,
- struct zone *zone)
+ gfp_t gfp_mask, bool noswap,
+ struct zone *zone,
+ struct memcg_scanrecord *rec,
+ unsigned long *scanned)
{
struct scan_control sc = {
+ .nr_scanned = 0,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
- .swappiness = swappiness,
.order = 0,
.mem_cgroup = mem,
+ .memcg_record = rec,
};
+ unsigned long start, end;
+
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2165,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
sc.may_writepage,
sc.gfp_mask);
+ start = sched_clock();
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -2173,6 +2301,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
+ end = sched_clock();
+
+ if (rec)
+ rec->elapsed += end - start;
+ *scanned = sc.nr_scanned;
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2182,30 +2315,46 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
bool noswap,
- unsigned int swappiness)
+ struct memcg_scanrecord *rec)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ unsigned long start, end;
+ int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .swappiness = swappiness,
.order = 0,
.mem_cgroup = mem_cont,
+ .memcg_record = rec,
.nodemask = NULL, /* we don't care the placement */
+ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ };
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
};
- sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
- (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+ start = sched_clock();
+ /*
+ * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+ * care where the pages come from. So the node where we start the
+ * scan does not need to be the current node.
+ */
+ nid = mem_cgroup_select_victim_node(mem_cont);
+
+ zonelist = NODE_DATA(nid)->node_zonelists;
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+ end = sched_clock();
+ if (rec)
+ rec->elapsed += end - start;
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -2224,7 +2373,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
* o a 16M DMA zone that is balanced will not balance a zone on any
* reasonable sized machine
* o On all other machines, the top zone must be at least a reasonable
- * precentage of the middle zones. For example, on 32-bit x86, highmem
+ * percentage of the middle zones. For example, on 32-bit x86, highmem
* would need to be at least 256M for it to be balance a whole node.
* Similarly, on x86-64 the Normal zone would need to be at least 1G
* to balance a node on its own. These seemed like reasonable ratios.
@@ -2238,7 +2387,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
for (i = 0; i <= classzone_idx; i++)
present_pages += pgdat->node_zones[i].present_pages;
- return balanced_pages > (present_pages >> 2);
+ /* A special case here: if the zone has no pages, we consider it balanced */
+ return balanced_pages >= (present_pages >> 2);
}
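A small sketch of the balance test as changed above, including the empty-node case that motivated switching from ">" to ">=". The page counts are made up.

#include <stdio.h>
#include <stdbool.h>

/* A node counts as balanced once at least 25% of its pages sit in
 * balanced zones; with ">=", an empty node (0 >= 0) is balanced too. */
static bool node_balanced(unsigned long balanced_pages,
			  unsigned long present_pages)
{
	return balanced_pages >= (present_pages >> 2);
}

int main(void)
{
	printf("%d\n", node_balanced(300000, 1000000)); /* 1: 30% balanced */
	printf("%d\n", node_balanced(200000, 1000000)); /* 0: only 20%     */
	printf("%d\n", node_balanced(0, 0));            /* 1: empty node   */
	return 0;
}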
/* is kswapd sleeping prematurely? */
@@ -2254,7 +2404,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
return true;
/* Check the watermark levels */
- for (i = 0; i < pgdat->nr_zones; i++) {
+ for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
@@ -2272,7 +2422,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
}
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- classzone_idx, 0))
+ i, 0))
all_zones_ok = false;
else
balanced += zone->present_pages;
@@ -2284,7 +2434,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
* must be balanced
*/
if (order)
- return pgdat_balanced(pgdat, balanced, classzone_idx);
+ return !pgdat_balanced(pgdat, balanced, classzone_idx);
else
return !all_zones_ok;
}
@@ -2320,6 +2470,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
@@ -2329,10 +2481,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* we want to put equal scanning pressure on each zone.
*/
.nr_to_reclaim = ULONG_MAX,
- .swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
@@ -2345,7 +2499,7 @@ loop_again:
/* The swap token gets in the way of swapout... */
if (!priority)
- disable_swap_token();
+ disable_swap_token(NULL);
all_zones_ok = 1;
balanced = 0;
@@ -2374,7 +2528,6 @@ loop_again:
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
- *classzone_idx = i;
break;
}
}
@@ -2397,9 +2550,9 @@ loop_again:
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
- int compaction;
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
+ unsigned long balance_gap;
if (!populated_zone(zone))
continue;
@@ -2409,45 +2562,42 @@ loop_again:
sc.nr_scanned = 0;
+ nr_soft_scanned = 0;
/*
* Call soft limit reclaim before calling shrink_zone.
- * For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ order, sc.gfp_mask,
+ &nr_soft_scanned);
+ sc.nr_reclaimed += nr_soft_reclaimed;
+ total_scanned += nr_soft_scanned;
/*
- * We put equal pressure on every zone, unless one
- * zone has way too many pages free already.
+ * We put equal pressure on every zone, unless
+ * one zone has way too many pages free
+ * already. The "too many pages" is defined
+ * as the high wmark plus a "gap" where the
+ * gap is either the low watermark or 1%
+ * of the zone, whichever is smaller.
*/
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages +
+ KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (!zone_watermark_ok_safe(zone, order,
- 8*high_wmark_pages(zone), end_zone, 0))
+ high_wmark_pages(zone) + balance_gap,
+ end_zone, 0)) {
shrink_zone(priority, zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_scanned += sc.nr_scanned;
-
- compaction = 0;
- if (order &&
- zone_watermark_ok(zone, 0,
- high_wmark_pages(zone),
- end_zone, 0) &&
- !zone_watermark_ok(zone, order,
- high_wmark_pages(zone),
- end_zone, 0)) {
- compact_zone_order(zone,
- order,
- sc.gfp_mask, false,
- COMPACT_MODE_KSWAPD);
- compaction = 1;
+
+ reclaim_state->reclaimed_slab = 0;
+ nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
+
+ if (nr_slab == 0 && !zone_reclaimable(zone))
+ zone->all_unreclaimable = 1;
}
- if (zone->all_unreclaimable)
- continue;
- if (!compaction && nr_slab == 0 &&
- !zone_reclaimable(zone))
- zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -2457,6 +2607,12 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
+ if (zone->all_unreclaimable) {
+ if (end_zone && end_zone == i)
+ end_zone--;
+ continue;
+ }
+
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
@@ -2635,8 +2791,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
static int kswapd(void *p)
{
- unsigned long order;
- int classzone_idx;
+ unsigned long order, new_order;
+ int classzone_idx, new_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -2666,17 +2822,23 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- order = 0;
- classzone_idx = MAX_NR_ZONES - 1;
+ order = new_order = 0;
+ classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
for ( ; ; ) {
- unsigned long new_order;
- int new_classzone_idx;
int ret;
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ /*
+ * If the last balance_pgdat was unsuccessful it's unlikely a
+ * new request of a similar or harder type will succeed soon,
+ * so consider going to sleep on the basis of the order we have
+ * already reclaimed at.
+ */
+ if (classzone_idx >= new_classzone_idx && order == new_order) {
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
+ }
+
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
@@ -2689,7 +2851,7 @@ static int kswapd(void *p)
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
}
ret = try_to_freeze();
@@ -2788,10 +2950,12 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.may_writepage = 1,
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
- .swappiness = vm_swappiness,
.order = 0,
};
- struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
@@ -2800,7 +2964,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -2972,9 +3136,11 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.nr_to_reclaim = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
- .swappiness = vm_swappiness,
.order = order,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
unsigned long nr_slab_pages0, nr_slab_pages1;
cond_resched();
@@ -3016,7 +3182,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
unsigned long lru_pages = zone_reclaimable_pages(zone);
/* No reclaimable slab or very low memory pressure */
- if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
break;
/* Freed enough memory */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0c3b5048773..20c18b7694b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -157,7 +157,7 @@ int calculate_normal_threshold(struct zone *zone)
/*
* Refresh the thresholds for each zone.
*/
-static void refresh_zone_stat_thresholds(void)
+void refresh_zone_stat_thresholds(void)
{
struct zone *zone;
int cpu;
@@ -321,9 +321,12 @@ static inline void mod_state(struct zone *zone,
/*
* The fetching of the stat_threshold is racy. We may apply
* a counter threshold to the wrong the cpu if we get
- * rescheduled while executing here. However, the following
- * will apply the threshold again and therefore bring the
- * counter under the threshold.
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyway
+ * for all cpus in a zone.
*/
t = this_cpu_read(pcp->stat_threshold);
@@ -500,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu)
* z = the zone from which the allocation occurred.
*
* Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set, assume the node of the preferred
+ * zone is the local node. This is useful for daemons that allocate
+ * memory on behalf of other processes.
*/
-void zone_statistics(struct zone *preferred_zone, struct zone *z)
+void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
{
if (z->zone_pgdat == preferred_zone->zone_pgdat) {
__inc_zone_state(z, NUMA_HIT);
@@ -509,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
- if (z->node == numa_node_id())
+ if (z->node == ((flags & __GFP_OTHER_NODE) ?
+ preferred_zone->node : numa_node_id()))
__inc_zone_state(z, NUMA_LOCAL);
else
__inc_zone_state(z, NUMA_OTHER);
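A sketch of the NUMA_LOCAL decision as changed above: with the new flag, the "local" judgement is made against the preferred zone's node instead of the running CPU's node. The helper and the node numbers below are illustrative only.

#include <stdio.h>
#include <stdbool.h>

/* With the flag set, compare the zone's node against the preferred
 * node (the node the allocation was requested for); otherwise compare
 * against the node the current CPU runs on. */
static bool counts_as_local(int zone_node, int preferred_node,
			    int current_node, bool gfp_other_node)
{
	return zone_node == (gfp_other_node ? preferred_node : current_node);
}

int main(void)
{
	/* A daemon on node 0 allocating for a task whose preferred node
	 * is 1, and the page ends up on node 1: */
	printf("without flag: %d\n",
	       counts_as_local(1, 1, 0, false)); /* 0 -> counted as NUMA_OTHER */
	printf("with flag:    %d\n",
	       counts_as_local(1, 1, 0, true));  /* 1 -> counted as NUMA_LOCAL */
	return 0;
}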
@@ -651,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
}
#endif
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
+#ifdef CONFIG_ZONE_DMA
+#define TEXT_FOR_DMA(xx) xx "_dma",
+#else
+#define TEXT_FOR_DMA(xx)
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#else
+#define TEXT_FOR_DMA32(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#else
+#define TEXT_FOR_HIGHMEM(xx)
+#endif
+
+#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
+ TEXT_FOR_HIGHMEM(xx) xx "_movable",
+
+const char * const vmstat_text[] = {
+ /* Zoned VM counters */
+ "nr_free_pages",
+ "nr_inactive_anon",
+ "nr_active_anon",
+ "nr_inactive_file",
+ "nr_active_file",
+ "nr_unevictable",
+ "nr_mlock",
+ "nr_anon_pages",
+ "nr_mapped",
+ "nr_file_pages",
+ "nr_dirty",
+ "nr_writeback",
+ "nr_slab_reclaimable",
+ "nr_slab_unreclaimable",
+ "nr_page_table_pages",
+ "nr_kernel_stack",
+ "nr_unstable",
+ "nr_bounce",
+ "nr_vmscan_write",
+ "nr_writeback_temp",
+ "nr_isolated_anon",
+ "nr_isolated_file",
+ "nr_shmem",
+ "nr_dirtied",
+ "nr_written",
+
+#ifdef CONFIG_NUMA
+ "numa_hit",
+ "numa_miss",
+ "numa_foreign",
+ "numa_interleave",
+ "numa_local",
+ "numa_other",
+#endif
+ "nr_anon_transparent_hugepages",
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+ "pgpgin",
+ "pgpgout",
+ "pswpin",
+ "pswpout",
+
+ TEXTS_FOR_ZONES("pgalloc")
+
+ "pgfree",
+ "pgactivate",
+ "pgdeactivate",
+
+ "pgfault",
+ "pgmajfault",
+
+ TEXTS_FOR_ZONES("pgrefill")
+ TEXTS_FOR_ZONES("pgsteal")
+ TEXTS_FOR_ZONES("pgscan_kswapd")
+ TEXTS_FOR_ZONES("pgscan_direct")
+
+#ifdef CONFIG_NUMA
+ "zone_reclaim_failed",
+#endif
+ "pginodesteal",
+ "slabs_scanned",
+ "kswapd_steal",
+ "kswapd_inodesteal",
+ "kswapd_low_wmark_hit_quickly",
+ "kswapd_high_wmark_hit_quickly",
+ "kswapd_skip_congestion_wait",
+ "pageoutrun",
+ "allocstall",
+
+ "pgrotated",
+
+#ifdef CONFIG_COMPACTION
+ "compact_blocks_moved",
+ "compact_pages_moved",
+ "compact_pagemigrate_failed",
+ "compact_stall",
+ "compact_fail",
+ "compact_success",
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+ "htlb_buddy_alloc_success",
+ "htlb_buddy_alloc_fail",
+#endif
+ "unevictable_pgs_culled",
+ "unevictable_pgs_scanned",
+ "unevictable_pgs_rescued",
+ "unevictable_pgs_mlocked",
+ "unevictable_pgs_munlocked",
+ "unevictable_pgs_cleared",
+ "unevictable_pgs_stranded",
+ "unevictable_pgs_mlockfreed",
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "thp_fault_alloc",
+ "thp_fault_fallback",
+ "thp_collapse_alloc",
+ "thp_collapse_alloc_failed",
+ "thp_split",
+#endif
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+};
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
+
+
#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
@@ -823,126 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
.release = seq_release,
};
-#ifdef CONFIG_ZONE_DMA
-#define TEXT_FOR_DMA(xx) xx "_dma",
-#else
-#define TEXT_FOR_DMA(xx)
-#endif
-
-#ifdef CONFIG_ZONE_DMA32
-#define TEXT_FOR_DMA32(xx) xx "_dma32",
-#else
-#define TEXT_FOR_DMA32(xx)
-#endif
-
-#ifdef CONFIG_HIGHMEM
-#define TEXT_FOR_HIGHMEM(xx) xx "_high",
-#else
-#define TEXT_FOR_HIGHMEM(xx)
-#endif
-
-#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx) xx "_movable",
-
-static const char * const vmstat_text[] = {
- /* Zoned VM counters */
- "nr_free_pages",
- "nr_inactive_anon",
- "nr_active_anon",
- "nr_inactive_file",
- "nr_active_file",
- "nr_unevictable",
- "nr_mlock",
- "nr_anon_pages",
- "nr_mapped",
- "nr_file_pages",
- "nr_dirty",
- "nr_writeback",
- "nr_slab_reclaimable",
- "nr_slab_unreclaimable",
- "nr_page_table_pages",
- "nr_kernel_stack",
- "nr_unstable",
- "nr_bounce",
- "nr_vmscan_write",
- "nr_writeback_temp",
- "nr_isolated_anon",
- "nr_isolated_file",
- "nr_shmem",
- "nr_dirtied",
- "nr_written",
-
-#ifdef CONFIG_NUMA
- "numa_hit",
- "numa_miss",
- "numa_foreign",
- "numa_interleave",
- "numa_local",
- "numa_other",
-#endif
- "nr_anon_transparent_hugepages",
- "nr_dirty_threshold",
- "nr_dirty_background_threshold",
-
-#ifdef CONFIG_VM_EVENT_COUNTERS
- "pgpgin",
- "pgpgout",
- "pswpin",
- "pswpout",
-
- TEXTS_FOR_ZONES("pgalloc")
-
- "pgfree",
- "pgactivate",
- "pgdeactivate",
-
- "pgfault",
- "pgmajfault",
-
- TEXTS_FOR_ZONES("pgrefill")
- TEXTS_FOR_ZONES("pgsteal")
- TEXTS_FOR_ZONES("pgscan_kswapd")
- TEXTS_FOR_ZONES("pgscan_direct")
-
-#ifdef CONFIG_NUMA
- "zone_reclaim_failed",
-#endif
- "pginodesteal",
- "slabs_scanned",
- "kswapd_steal",
- "kswapd_inodesteal",
- "kswapd_low_wmark_hit_quickly",
- "kswapd_high_wmark_hit_quickly",
- "kswapd_skip_congestion_wait",
- "pageoutrun",
- "allocstall",
-
- "pgrotated",
-
-#ifdef CONFIG_COMPACTION
- "compact_blocks_moved",
- "compact_pages_moved",
- "compact_pagemigrate_failed",
- "compact_stall",
- "compact_fail",
- "compact_success",
-#endif
-
-#ifdef CONFIG_HUGETLB_PAGE
- "htlb_buddy_alloc_success",
- "htlb_buddy_alloc_fail",
-#endif
- "unevictable_pgs_culled",
- "unevictable_pgs_scanned",
- "unevictable_pgs_rescued",
- "unevictable_pgs_mlocked",
- "unevictable_pgs_munlocked",
- "unevictable_pgs_cleared",
- "unevictable_pgs_stranded",
- "unevictable_pgs_mlockfreed",
-#endif
-};
-
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1181,7 +1201,6 @@ static int __init setup_vmstat(void)
#ifdef CONFIG_SMP
int cpu;
- refresh_zone_stat_thresholds();
register_cpu_notifier(&vmstat_notifier);
for_each_online_cpu(cpu)